diff options
Diffstat (limited to 'fs/gfs2')
73 files changed, 15835 insertions, 14785 deletions
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index de8e64c03f7..90c6a8faaec 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,8 +1,9 @@ config GFS2_FS tristate "GFS2 file system support" - depends on EXPERIMENTAL + depends on (64BIT || LBDAF) select FS_POSIX_ACL select CRC32 + select QUOTACTL help A cluster filesystem. @@ -14,33 +15,20 @@ config GFS2_FS GFS is perfect consistency -- changes made to the filesystem on one machine show up immediately on all other machines in the cluster. - To use the GFS2 filesystem, you will need to enable one or more of - the below locking modules. Documentation and utilities for GFS2 can + To use the GFS2 filesystem in a cluster, you will need to enable + the locking module below. Documentation and utilities for GFS2 can be found here: http://sources.redhat.com/cluster -config GFS2_FS_LOCKING_NOLOCK - tristate "GFS2 \"nolock\" locking module" - depends on GFS2_FS - help - Single node locking module for GFS2. - - Use this module if you want to use GFS2 on a single node without - its clustering features. You can still take advantage of the - large file support, and upgrade to running a full cluster later on - if required. - - If you will only be using GFS2 in cluster mode, you do not need this - module. + The "nolock" lock module is now built in to GFS2 by default. If + you want to use the DLM, be sure to enable IPv4/6 networking. config GFS2_FS_LOCKING_DLM - tristate "GFS2 DLM locking module" - depends on GFS2_FS && SYSFS && NET && INET && (IPV6 || IPV6=n) - select IP_SCTP if DLM_SCTP - select CONFIGFS_FS - select DLM + bool "GFS2 DLM locking" + depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \ + CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS) help Multiple node locking module for GFS2 - Most users of GFS2 will require this module. It provides the locking + Most users of GFS2 will require this. It provides the locking interface between GFS2 and the DLM, which is required to use GFS2 in a cluster environment. diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index 8fff11058ce..86128202384 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,10 +1,10 @@ +ccflags-y := -I$(src) obj-$(CONFIG_GFS2_FS) += gfs2.o -gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ - glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \ - mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ - ops_fstype.o ops_inode.o ops_super.o quota.o \ +gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \ + glops.o log.o lops.o main.o meta_io.o \ + aops.o dentry.o export.o file.o \ + ops_fstype.o inode.o quota.o \ recovery.o rgrp.o super.o sys.o trans.o util.o -obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/ -obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/ +gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 1047a8c7226..3088e2a38e3 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -12,283 +12,110 @@ #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> +#include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> #include "gfs2.h" #include "incore.h" #include "acl.h" -#include "eaops.h" -#include "eattr.h" +#include "xattr.h" #include "glock.h" #include "inode.h" #include "meta_io.h" #include "trans.h" #include "util.h" -#define ACL_ACCESS 1 -#define ACL_DEFAULT 0 - -int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, - struct gfs2_ea_request *er, - int *remove, mode_t *mode) +static const char *gfs2_acl_name(int type) { - struct posix_acl *acl; - int error; - - error = gfs2_acl_validate_remove(ip, access); - if (error) - return error; - - if (!er->er_data) - return -EINVAL; - - acl = posix_acl_from_xattr(er->er_data, er->er_data_len); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) { - *remove = 1; - return 0; - } - - error = posix_acl_valid(acl); - if (error) - goto out; - - if (access) { - error = posix_acl_equiv_mode(acl, mode); - if (!error) - *remove = 1; - else if (error > 0) - error = 0; + switch (type) { + case ACL_TYPE_ACCESS: + return GFS2_POSIX_ACL_ACCESS; + case ACL_TYPE_DEFAULT: + return GFS2_POSIX_ACL_DEFAULT; } - -out: - posix_acl_release(acl); - return error; + return NULL; } -int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access) +struct posix_acl *gfs2_get_acl(struct inode *inode, int type) { - if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl) - return -EOPNOTSUPP; - if (!is_owner_or_cap(&ip->i_inode)) - return -EPERM; - if (S_ISLNK(ip->i_inode.i_mode)) - return -EOPNOTSUPP; - if (!access && !S_ISDIR(ip->i_inode.i_mode)) - return -EACCES; - - return 0; -} - -static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, - struct gfs2_ea_location *el, char **data, unsigned int *len) -{ - struct gfs2_ea_request er; - struct gfs2_ea_location el_this; - int error; - - if (!ip->i_di.di_eattr) - return 0; - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - if (access) { - er.er_name = GFS2_POSIX_ACL_ACCESS; - er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN; - } else { - er.er_name = GFS2_POSIX_ACL_DEFAULT; - er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; - } - er.er_type = GFS2_EATYPE_SYS; - - if (!el) - el = &el_this; - - error = gfs2_ea_find(ip, &er, el); - if (error) - return error; - if (!el->el_ea) - return 0; - if (!GFS2_EA_DATA_LEN(el->el_ea)) - goto out; - - er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea); - er.er_data = kmalloc(er.er_data_len, GFP_KERNEL); - error = -ENOMEM; - if (!er.er_data) - goto out; - - error = gfs2_ea_get_copy(ip, el, er.er_data); - if (error) - goto out_kfree; - - if (acl) { - *acl = posix_acl_from_xattr(er.er_data, er.er_data_len); - if (IS_ERR(*acl)) - error = PTR_ERR(*acl); - } - -out_kfree: - if (error || !data) - kfree(er.er_data); - else { - *data = er.er_data; - *len = er.er_data_len; - } -out: - if (error || el == &el_this) - brelse(el->el_bh); - return error; -} - -/** - * gfs2_check_acl - Check an ACL to see if we're allowed to do something - * @inode: the file we want to do something to - * @mask: what we want to do - * - * Returns: errno - */ + struct gfs2_inode *ip = GFS2_I(inode); + struct posix_acl *acl; + const char *name; + char *data; + int len; -int gfs2_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl = NULL; - int error; + if (!ip->i_eattr) + return NULL; - error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL); - if (error) - return error; + name = gfs2_acl_name(type); + if (name == NULL) + return ERR_PTR(-EINVAL); - if (acl) { - error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } + len = gfs2_xattr_acl_get(ip, name, &data); + if (len < 0) + return ERR_PTR(len); + if (len == 0) + return NULL; - return -EAGAIN; + acl = posix_acl_from_xattr(&init_user_ns, data, len); + kfree(data); + return acl; } -static int munge_mode(struct gfs2_inode *ip, mode_t mode) +int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct buffer_head *dibh; int error; + int len; + char *data; + const char *name = gfs2_acl_name(type); - error = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (error) - return error; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (!error) { - gfs2_assert_withdraw(sdp, - (ip->i_inode.i_mode & S_IFMT) == (mode & S_IFMT)); - ip->i_inode.i_mode = mode; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - } - - gfs2_trans_end(sdp); + BUG_ON(name == NULL); - return 0; -} + if (acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) + return -E2BIG; -int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) -{ - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct posix_acl *acl = NULL, *clone; - struct gfs2_ea_request er; - mode_t mode = ip->i_inode.i_mode; - int error; + if (type == ACL_TYPE_ACCESS) { + umode_t mode = inode->i_mode; - if (!sdp->sd_args.ar_posix_acl) - return 0; - if (S_ISLNK(ip->i_inode.i_mode)) - return 0; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_type = GFS2_EATYPE_SYS; + if (error == 0) + acl = NULL; - error = acl_get(dip, ACL_DEFAULT, &acl, NULL, - &er.er_data, &er.er_data_len); - if (error) - return error; - if (!acl) { - mode &= ~current->fs->umask; - if (mode != ip->i_inode.i_mode) - error = munge_mode(ip, mode); - return error; + if (mode != inode->i_mode) { + inode->i_mode = mode; + mark_inode_dirty(inode); + } } - clone = posix_acl_clone(acl, GFP_KERNEL); - error = -ENOMEM; - if (!clone) - goto out; - posix_acl_release(acl); - acl = clone; - - if (S_ISDIR(ip->i_inode.i_mode)) { - er.er_name = GFS2_POSIX_ACL_DEFAULT; - er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; - error = gfs2_system_eaops.eo_set(ip, &er); - if (error) + if (acl) { + len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); + if (len == 0) + return 0; + data = kmalloc(len, GFP_NOFS); + if (data == NULL) + return -ENOMEM; + error = posix_acl_to_xattr(&init_user_ns, acl, data, len); + if (error < 0) goto out; + } else { + data = NULL; + len = 0; } - error = posix_acl_create_masq(acl, &mode); - if (error < 0) - goto out; - if (error > 0) { - er.er_name = GFS2_POSIX_ACL_ACCESS; - er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN; - posix_acl_to_xattr(acl, er.er_data, er.er_data_len); - er.er_mode = mode; - er.er_flags = GFS2_ERF_MODE; - error = gfs2_system_eaops.eo_set(ip, &er); - if (error) - goto out; - } else - munge_mode(ip, mode); - -out: - posix_acl_release(acl); - kfree(er.er_data); - return error; -} - -int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) -{ - struct posix_acl *acl = NULL, *clone; - struct gfs2_ea_location el; - char *data; - unsigned int len; - int error; - - error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len); + error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); if (error) - return error; - if (!acl) - return gfs2_setattr_simple(ip, attr); - - clone = posix_acl_clone(acl, GFP_KERNEL); - error = -ENOMEM; - if (!clone) goto out; - posix_acl_release(acl); - acl = clone; - - error = posix_acl_chmod_masq(acl, attr->ia_mode); - if (!error) { - posix_acl_to_xattr(acl, data, len); - error = gfs2_ea_acl_chmod(ip, &el, attr, data); - } + if (acl) + set_cached_acl(inode, type, acl); + else + forget_cached_acl(inode, type); out: - posix_acl_release(acl); - brelse(el.el_bh); kfree(data); return error; } - diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index 6751930bfb6..2d65ec4cd4b 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -13,26 +13,10 @@ #include "incore.h" #define GFS2_POSIX_ACL_ACCESS "posix_acl_access" -#define GFS2_POSIX_ACL_ACCESS_LEN 16 #define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" -#define GFS2_POSIX_ACL_DEFAULT_LEN 17 +#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12) -#define GFS2_ACL_IS_ACCESS(name, len) \ - ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \ - !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len))) - -#define GFS2_ACL_IS_DEFAULT(name, len) \ - ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \ - !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len))) - -struct gfs2_ea_request; - -int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, - struct gfs2_ea_request *er, - int *remove, mode_t *mode); -int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access); -int gfs2_check_acl(struct inode *inode, int mask); -int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip); -int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); +extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); +extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/aops.c index ac772b6d9db..805b37fed63 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/aops.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -19,9 +19,9 @@ #include <linux/writeback.h> #include <linux/swap.h> #include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> #include <linux/backing-dev.h> -#include <linux/pagevec.h> +#include <linux/aio.h> +#include <trace/events/writeback.h> #include "gfs2.h" #include "incore.h" @@ -30,7 +30,6 @@ #include "inode.h" #include "log.h" #include "meta_io.h" -#include "ops_address.h" #include "quota.h" #include "trans.h" #include "rgrp.h" @@ -54,7 +53,7 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, continue; if (gfs2_is_jdata(ip)) set_buffer_uptodate(bh); - gfs2_trans_add_bh(ip->i_gl, bh, 0); + gfs2_trans_add_data(ip->i_gl, bh); } } @@ -104,17 +103,15 @@ static int gfs2_writepage_common(struct page *page, loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; unsigned offset; - int ret = -EIO; if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) goto out; - ret = 0; if (current->journal_info) goto redirty; /* Is the page fully outside i_size? (truncate in progress) */ offset = i_size & (PAGE_CACHE_SIZE-1); if (page->index > end_index || (page->index == end_index && !offset)) { - page->mapping->a_ops->invalidatepage(page, 0); + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); goto out; } return 1; @@ -126,51 +123,21 @@ out: } /** - * gfs2_writeback_writepage - Write page for writeback mappings + * gfs2_writepage - Write page for writeback mappings * @page: The page * @wbc: The writeback control * */ -static int gfs2_writeback_writepage(struct page *page, - struct writeback_control *wbc) -{ - int ret; - - ret = gfs2_writepage_common(page, wbc); - if (ret <= 0) - return ret; - - ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc); - if (ret == -EAGAIN) - ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc); - return ret; -} - -/** - * gfs2_ordered_writepage - Write page for ordered data files - * @page: The page to write - * @wbc: The writeback control - * - */ - -static int gfs2_ordered_writepage(struct page *page, - struct writeback_control *wbc) +static int gfs2_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); int ret; ret = gfs2_writepage_common(page, wbc); if (ret <= 0) return ret; - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1); - return block_write_full_page(page, gfs2_get_block_noalloc, wbc); + return nobh_writepage(page, gfs2_get_block_noalloc, wbc); } /** @@ -213,25 +180,23 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc { struct inode *inode = page->mapping->host; struct gfs2_sbd *sdp = GFS2_SB(inode); - int error; + int ret; int done_trans = 0; - error = gfs2_writepage_common(page, wbc); - if (error <= 0) - return error; - if (PageChecked(page)) { if (wbc->sync_mode != WB_SYNC_ALL) goto out_ignore; - error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); - if (error) + ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); + if (ret) goto out_ignore; done_trans = 1; } - error = __gfs2_jdata_writepage(page, wbc); + ret = gfs2_writepage_common(page, wbc); + if (ret > 0) + ret = __gfs2_jdata_writepage(page, wbc); if (done_trans) gfs2_trans_end(sdp); - return error; + return ret; out_ignore: redirty_page_for_writepage(wbc, page); @@ -240,16 +205,14 @@ out_ignore: } /** - * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk + * gfs2_writepages - Write a bunch of dirty pages back to disk * @mapping: The mapping to write * @wbc: Write-back control * - * For the data=writeback case we can already ignore buffer heads - * and write whole extents at once. This is a big reduction in the - * number of I/O requests we send and the bmap calls we make in this case. + * Used for both ordered and writeback modes. */ -static int gfs2_writeback_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int gfs2_writepages(struct address_space *mapping, + struct writeback_control *wbc) { return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); } @@ -268,61 +231,97 @@ static int gfs2_writeback_writepages(struct address_space *mapping, static int gfs2_write_jdata_pagevec(struct address_space *mapping, struct writeback_control *wbc, struct pagevec *pvec, - int nr_pages, pgoff_t end) + int nr_pages, pgoff_t end, + pgoff_t *done_index) { struct inode *inode = mapping->host; struct gfs2_sbd *sdp = GFS2_SB(inode); - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; - unsigned offset = i_size & (PAGE_CACHE_SIZE-1); unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); - struct backing_dev_info *bdi = mapping->backing_dev_info; int i; int ret; - ret = gfs2_trans_begin(sdp, nrblocks, 0); + ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); if (ret < 0) return ret; for(i = 0; i < nr_pages; i++) { struct page *page = pvec->pages[i]; + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. + */ + if (page->index > end) { + /* + * can't be range_cyclic (1st pass) because + * end == -1 in that case. + */ + ret = 1; + break; + } + + *done_index = page->index; + lock_page(page); if (unlikely(page->mapping != mapping)) { +continue_unlock: unlock_page(page); continue; } - if (!wbc->range_cyclic && page->index > end) { - ret = 1; - unlock_page(page); - continue; + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; } - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - continue; + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + else + goto continue_unlock; } - /* Is the page fully outside i_size? (truncate in progress) */ - if (page->index > end_index || (page->index == end_index && !offset)) { - page->mapping->a_ops->invalidatepage(page, 0); - unlock_page(page); - continue; - } + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + trace_wbc_writepage(wbc, mapping->backing_dev_info); ret = __gfs2_jdata_writepage(page, wbc); + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + } else { + + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout). + */ + *done_index = page->index + 1; + ret = 1; + break; + } + } - if (ret || (--(wbc->nr_to_write) <= 0)) - ret = 1; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; + /* + * We stop writing back only if we are not doing + * integrity sync. In case of integrity sync we have to + * keep going until we have written all the pages + * we tagged for writeback prior to entering this loop. + */ + if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { ret = 1; + break; } } @@ -345,61 +344,73 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, static int gfs2_write_cache_jdata(struct address_space *mapping, struct writeback_control *wbc) { - struct backing_dev_info *bdi = mapping->backing_dev_info; int ret = 0; int done = 0; struct pagevec pvec; int nr_pages; + pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; - int scanned = 0; + pgoff_t done_index; + int cycled; int range_whole = 0; - - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } + int tag; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ + writeback_index = mapping->writeback_index; /* prev offset */ + index = writeback_index; + if (index == 0) + cycled = 1; + else + cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - scanned = 1; + cycled = 1; /* ignore range_cyclic tests */ } + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; retry: - while (!done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { - scanned = 1; - ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end); + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + done_index = index; + while (!done && (index <= end)) { + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end, &done_index); if (ret) done = 1; if (ret > 0) ret = 0; - pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done) { + if (!cycled && !done) { /* + * range_cyclic: * We hit the last page and there is more work to be done: wrap * back to the start of the file */ - scanned = 1; + cycled = 1; index = 0; + end = writeback_index - 1; goto retry; } if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; + mapping->writeback_index = done_index; + return ret; } @@ -420,7 +431,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping, ret = gfs2_write_cache_jdata(mapping, wbc); if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) { - gfs2_log_flush(sdp, ip->i_gl); + gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); ret = gfs2_write_cache_jdata(mapping, wbc); } return ret; @@ -437,16 +448,18 @@ static int gfs2_jdata_writepages(struct address_space *mapping, static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) { struct buffer_head *dibh; + u64 dsize = i_size_read(&ip->i_inode); void *kaddr; int error; /* - * Due to the order of unstuffing files and ->nopage(), we can be + * Due to the order of unstuffing files and ->fault(), we can be * asked for a zero page in the case of a stuffed file being extended, * so we need to supply one here. It doesn't happen often. */ if (unlikely(page->index)) { zero_user(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); return 0; } @@ -454,11 +467,12 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) if (error) return error; - kaddr = kmap_atomic(page, KM_USER0); - memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), - ip->i_di.di_size); - memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size); - kunmap_atomic(kaddr, KM_USER0); + kaddr = kmap_atomic(page); + if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) + dsize = (dibh->b_size - sizeof(struct gfs2_dinode)); + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); + memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); + kunmap_atomic(kaddr); flush_dcache_page(page); brelse(dibh); SetPageUptodate(page); @@ -502,46 +516,48 @@ static int __gfs2_readpage(void *file, struct page *page) * @file: The file to read * @page: The page of the file * - * This deals with the locking required. We use a trylock in order to - * avoid the page lock / glock ordering problems returning AOP_TRUNCATED_PAGE - * in the event that we are unable to get the lock. + * This deals with the locking required. We have to unlock and + * relock the page in order to get the locking in the right + * order. */ static int gfs2_readpage(struct file *file, struct page *page) { - struct gfs2_inode *ip = GFS2_I(page->mapping->host); + struct address_space *mapping = page->mapping; + struct gfs2_inode *ip = GFS2_I(mapping->host); struct gfs2_holder gh; int error; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh); - error = gfs2_glock_nq_atime(&gh); - if (unlikely(error)) { - unlock_page(page); + unlock_page(page); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + error = gfs2_glock_nq(&gh); + if (unlikely(error)) goto out; - } - error = __gfs2_readpage(file, page); + error = AOP_TRUNCATED_PAGE; + lock_page(page); + if (page->mapping == mapping && !PageUptodate(page)) + error = __gfs2_readpage(file, page); + else + unlock_page(page); gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); - if (error == GLR_TRYFAILED) { - yield(); - return AOP_TRUNCATED_PAGE; - } + if (error && error != AOP_TRUNCATED_PAGE) + lock_page(page); return error; } /** * gfs2_internal_read - read an internal file * @ip: The gfs2 inode - * @ra_state: The readahead state (or NULL for no readahead) * @buf: The buffer to fill * @pos: The file position * @size: The amount to read * */ -int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state, - char *buf, loff_t *pos, unsigned size) +int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, + unsigned size) { struct address_space *mapping = ip->i_inode.i_mapping; unsigned long index = *pos / PAGE_CACHE_SIZE; @@ -558,10 +574,9 @@ int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state, page = read_cache_page(mapping, index, __gfs2_readpage, NULL); if (IS_ERR(page)) return PTR_ERR(page); - p = kmap_atomic(page, KM_USER0); + p = kmap_atomic(page); memcpy(buf + copied, p + offset, amt); - kunmap_atomic(p, KM_USER0); - mark_page_accessed(page); + kunmap_atomic(p); page_cache_release(page); copied += amt; index++; @@ -594,8 +609,8 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping, struct gfs2_holder gh; int ret; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); - ret = gfs2_glock_nq_atime(&gh); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + ret = gfs2_glock_nq(&gh); if (unlikely(ret)) goto out_uninit; if (!gfs2_is_stuffed(ip)) @@ -627,38 +642,42 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, { struct gfs2_inode *ip = GFS2_I(mapping->host); struct gfs2_sbd *sdp = GFS2_SB(mapping->host); - unsigned int data_blocks, ind_blocks, rblocks; + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + unsigned requested = 0; int alloc_required; int error = 0; - struct gfs2_alloc *al; pgoff_t index = pos >> PAGE_CACHE_SHIFT; unsigned from = pos & (PAGE_CACHE_SIZE - 1); - unsigned to = from + len; struct page *page; - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh); - error = gfs2_glock_nq_atime(&ip->i_gh); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); if (unlikely(error)) goto out_uninit; + if (&ip->i_inode == sdp->sd_rindex) { + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, &m_ip->i_gh); + if (unlikely(error)) { + gfs2_glock_dq(&ip->i_gh); + goto out_uninit; + } + } - gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); - error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); - if (error) - goto out_unlock; - - if (alloc_required) { - al = gfs2_alloc_get(ip); + alloc_required = gfs2_write_alloc_required(ip, pos, len); - error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); - if (error) - goto out_alloc_put; + if (alloc_required || gfs2_is_jdata(ip)) + gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); - error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); + if (alloc_required) { + struct gfs2_alloc_parms ap = { .aflags = 0, }; + error = gfs2_quota_lock_check(ip); if (error) - goto out_qunlock; + goto out_unlock; - al->al_requested = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip); + requested = data_blocks + ind_blocks; + ap.target = requested; + error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_qunlock; } @@ -668,6 +687,10 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, rblocks += data_blocks ? data_blocks : 1; if (ind_blocks || data_blocks) rblocks += RES_STATFS + RES_QUOTA; + if (&ip->i_inode == sdp->sd_rindex) + rblocks += 2 * RES_STATFS; + if (alloc_required) + rblocks += gfs2_rg_blocks(ip, requested); error = gfs2_trans_begin(sdp, rblocks, PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); @@ -675,7 +698,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, goto out_trans_fail; error = -ENOMEM; - page = __grab_cache_page(mapping, index); + flags |= AOP_FLAG_NOFS; + page = grab_cache_page_write_begin(mapping, index, flags); *pagep = page; if (unlikely(!page)) goto out_endtrans; @@ -693,14 +717,19 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, } prepare_write: - error = block_prepare_write(page, from, to, gfs2_block_map); + error = __block_write_begin(page, from, len, gfs2_block_map); out: if (error == 0) return 0; + unlock_page(page); page_cache_release(page); + + gfs2_trans_end(sdp); if (pos + len > ip->i_inode.i_size) - vmtruncate(&ip->i_inode, ip->i_inode.i_size); + gfs2_trim_blocks(&ip->i_inode); + goto out_trans_fail; + out_endtrans: gfs2_trans_end(sdp); out_trans_fail: @@ -708,10 +737,12 @@ out_trans_fail: gfs2_inplace_release(ip); out_qunlock: gfs2_quota_unlock(ip); -out_alloc_put: - gfs2_alloc_put(ip); } out_unlock: + if (&ip->i_inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } gfs2_glock_dq(&ip->i_gh); out_uninit: gfs2_holder_uninit(&ip->i_gh); @@ -725,14 +756,21 @@ out_uninit: static void adjust_fs_space(struct inode *inode) { struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct buffer_head *m_bh, *l_bh; u64 fs_total, new_free; /* Total up the file system space, according to the latest rindex. */ fs_total = gfs2_ri_total(sdp); + if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0) + return; spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); if (fs_total > (m_sc->sc_total + l_sc->sc_total)) new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); else @@ -741,6 +779,13 @@ static void adjust_fs_space(struct inode *inode) fs_warn(sdp, "File system extended by %llu blocks.\n", (unsigned long long)new_free); gfs2_statfs_change(sdp, new_free, new_free, 0); + + if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0) + goto out; + update_statfs(sdp, m_bh, l_bh); + brelse(l_bh); +out: + brelse(m_bh); } /** @@ -763,35 +808,40 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); u64 to = pos + copied; void *kaddr; unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); - struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); memcpy(buf + pos, kaddr + pos, copied); memset(kaddr + pos + copied, 0, len - copied); flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); if (!PageUptodate(page)) SetPageUptodate(page); unlock_page(page); page_cache_release(page); - if (inode->i_size < to) { - i_size_write(inode, to); - ip->i_di.di_size = inode->i_size; - di->di_size = cpu_to_be64(inode->i_size); + if (copied) { + if (inode->i_size < to) + i_size_write(inode, to); mark_inode_dirty(inode); } - if (inode == sdp->sd_rindex) + if (inode == sdp->sd_rindex) { adjust_fs_space(inode); + sdp->sd_rindex_uptodate = 0; + } brelse(dibh); gfs2_trans_end(sdp); + if (inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } gfs2_glock_dq(&ip->i_gh); gfs2_holder_uninit(&ip->i_gh); return copied; @@ -821,14 +871,15 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, struct inode *inode = page->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct buffer_head *dibh; - struct gfs2_alloc *al = ip->i_alloc; - struct gfs2_dinode *di; unsigned int from = pos & (PAGE_CACHE_SIZE - 1); unsigned int to = from + len; int ret; + struct gfs2_trans *tr = current->journal_info; + BUG_ON(!tr); - BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == 0); + BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL); ret = gfs2_meta_inode_buffer(ip, &dibh); if (unlikely(ret)) { @@ -837,8 +888,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, goto failed; } - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - if (gfs2_is_stuffed(ip)) return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); @@ -846,24 +895,26 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, gfs2_page_add_databufs(ip, page, from, to); ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (tr->tr_num_buf_new) + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + else + gfs2_trans_add_meta(ip->i_gl, dibh); - if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) { - di = (struct gfs2_dinode *)dibh->b_data; - ip->i_di.di_size = inode->i_size; - di->di_size = cpu_to_be64(inode->i_size); - mark_inode_dirty(inode); - } - if (inode == sdp->sd_rindex) + if (inode == sdp->sd_rindex) { adjust_fs_space(inode); + sdp->sd_rindex_uptodate = 0; + } brelse(dibh); - gfs2_trans_end(sdp); failed: - if (al) { - gfs2_inplace_release(ip); + gfs2_trans_end(sdp); + gfs2_inplace_release(ip); + if (ip->i_res->rs_qa_qd_num) gfs2_quota_unlock(ip); - gfs2_alloc_put(ip); + if (inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); } gfs2_glock_dq(&ip->i_gh); gfs2_holder_uninit(&ip->i_gh); @@ -919,8 +970,8 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) clear_buffer_dirty(bh); bd = bh->b_private; if (bd) { - if (!list_empty(&bd->bd_le.le_list) && !buffer_pinned(bh)) - list_del_init(&bd->bd_le.le_list); + if (!list_empty(&bd->bd_list) && !buffer_pinned(bh)) + list_del_init(&bd->bd_list); else gfs2_remove_from_journal(bh, current->journal_info, 0); } @@ -932,27 +983,33 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) unlock_buffer(bh); } -static void gfs2_invalidatepage(struct page *page, unsigned long offset) +static void gfs2_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + unsigned int stop = offset + length; + int partial_page = (offset || length < PAGE_CACHE_SIZE); struct buffer_head *bh, *head; unsigned long pos = 0; BUG_ON(!PageLocked(page)); - if (offset == 0) + if (!partial_page) ClearPageChecked(page); if (!page_has_buffers(page)) goto out; bh = head = page_buffers(page); do { + if (pos + bh->b_size > stop) + return; + if (offset <= pos) gfs2_discard(sdp, bh); pos += bh->b_size; bh = bh->b_this_page; } while (bh != head); out: - if (offset == 0) + if (!partial_page) try_to_release_page(page, 0); } @@ -975,7 +1032,7 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) if (gfs2_is_stuffed(ip)) return 0; - if (offset > i_size_read(&ip->i_inode)) + if (offset >= i_size_read(&ip->i_inode)) return 0; return 1; } @@ -983,11 +1040,11 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + struct address_space *mapping = inode->i_mapping; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int rv; @@ -1000,19 +1057,49 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, * unfortunately have the option of only flushing a range like * the VFS does. */ - gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh); - rv = gfs2_glock_nq_atime(&gh); + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); + rv = gfs2_glock_nq(&gh); if (rv) return rv; rv = gfs2_ok_for_dio(ip, rw, offset); if (rv != 1) goto out; /* dio not valid, fall back to buffered i/o */ - rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, - iov, offset, nr_segs, - gfs2_get_block_direct, NULL); + /* + * Now since we are holding a deferred (CW) lock at this point, you + * might be wondering why this is ever needed. There is a case however + * where we've granted a deferred local lock against a cached exclusive + * glock. That is ok provided all granted local locks are deferred, but + * it also means that it is possible to encounter pages which are + * cached and possibly also mapped. So here we check for that and sort + * them out ahead of the dio. The glock state machine will take care of + * everything else. + * + * If in fact the cached glock state (gl->gl_state) is deferred (CW) in + * the first place, mapping->nr_pages will always be zero. + */ + if (mapping->nrpages) { + loff_t lstart = offset & (PAGE_CACHE_SIZE - 1); + loff_t len = iov_iter_count(iter); + loff_t end = PAGE_ALIGN(offset + len) - 1; + + rv = 0; + if (len == 0) + goto out; + if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len); + rv = filemap_write_and_wait_range(mapping, lstart, end); + if (rv) + goto out; + if (rw == WRITE) + truncate_inode_pages_range(mapping, lstart, end); + } + + rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + iter, offset, + gfs2_get_block_direct, NULL, NULL, 0); out: - gfs2_glock_dq_m(1, &gh); + gfs2_glock_dq(&gh); gfs2_holder_uninit(&gh); return rv; } @@ -1030,8 +1117,8 @@ out: int gfs2_releasepage(struct page *page, gfp_t gfp_mask) { - struct inode *aspace = page->mapping->host; - struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info; + struct address_space *mapping = page->mapping; + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); struct buffer_head *bh, *head; struct gfs2_bufdata *bd; @@ -1039,55 +1126,52 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) return 0; gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); head = bh = page_buffers(page); do { if (atomic_read(&bh->b_count)) goto cannot_release; bd = bh->b_private; - if (bd && bd->bd_ail) + if (bd && bd->bd_tr) goto cannot_release; - gfs2_assert_warn(sdp, !buffer_pinned(bh)); - gfs2_assert_warn(sdp, !buffer_dirty(bh)); + if (buffer_pinned(bh) || buffer_dirty(bh)) + goto not_possible; bh = bh->b_this_page; } while(bh != head); - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); head = bh = page_buffers(page); do { - gfs2_log_lock(sdp); bd = bh->b_private; if (bd) { gfs2_assert_warn(sdp, bd->bd_bh == bh); - gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr)); - if (!list_empty(&bd->bd_le.le_list)) { - if (!buffer_pinned(bh)) - list_del_init(&bd->bd_le.le_list); - else - bd = NULL; - } - if (bd) - bd->bd_bh = NULL; + if (!list_empty(&bd->bd_list)) + list_del_init(&bd->bd_list); + bd->bd_bh = NULL; bh->b_private = NULL; - } - gfs2_log_unlock(sdp); - if (bd) kmem_cache_free(gfs2_bufdata_cachep, bd); + } bh = bh->b_this_page; } while (bh != head); + gfs2_log_unlock(sdp); return try_to_free_buffers(page); + +not_possible: /* Should never happen */ + WARN_ON(buffer_dirty(bh)); + WARN_ON(buffer_pinned(bh)); cannot_release: + spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); return 0; } static const struct address_space_operations gfs2_writeback_aops = { - .writepage = gfs2_writeback_writepage, - .writepages = gfs2_writeback_writepages, + .writepage = gfs2_writepage, + .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readpages = gfs2_readpages, - .sync_page = block_sync_page, .write_begin = gfs2_write_begin, .write_end = gfs2_write_end, .bmap = gfs2_bmap, @@ -1095,13 +1179,15 @@ static const struct address_space_operations gfs2_writeback_aops = { .releasepage = gfs2_releasepage, .direct_IO = gfs2_direct_IO, .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations gfs2_ordered_aops = { - .writepage = gfs2_ordered_writepage, + .writepage = gfs2_writepage, + .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readpages = gfs2_readpages, - .sync_page = block_sync_page, .write_begin = gfs2_write_begin, .write_end = gfs2_write_end, .set_page_dirty = gfs2_set_page_dirty, @@ -1110,6 +1196,8 @@ static const struct address_space_operations gfs2_ordered_aops = { .releasepage = gfs2_releasepage, .direct_IO = gfs2_direct_IO, .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations gfs2_jdata_aops = { @@ -1117,13 +1205,14 @@ static const struct address_space_operations gfs2_jdata_aops = { .writepages = gfs2_jdata_writepages, .readpage = gfs2_readpage, .readpages = gfs2_readpages, - .sync_page = block_sync_page, .write_begin = gfs2_write_begin, .write_end = gfs2_write_end, .set_page_dirty = gfs2_set_page_dirty, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; void gfs2_set_aops(struct inode *inode) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e9456ebd3bb..e6ee5b6e8d9 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -7,13 +7,12 @@ * of the GNU General Public License version 2. */ -#include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> +#include <linux/blkdev.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> -#include <linux/lm_interface.h> #include "gfs2.h" #include "incore.h" @@ -23,24 +22,22 @@ #include "meta_io.h" #include "quota.h" #include "rgrp.h" +#include "log.h" +#include "super.h" #include "trans.h" #include "dir.h" #include "util.h" -#include "ops_address.h" +#include "trace_gfs2.h" /* This doesn't need to be that large as max 64 bit pointers in a 4k * block is 512, so __u16 is fine for that. It saves stack space to * keep it small. */ struct metapath { + struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; __u16 mp_list[GFS2_MAX_META_HEIGHT]; }; -typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh, - struct buffer_head *bh, __be64 *top, - __be64 *bottom, unsigned int height, - void *data); - struct strip_mine { int sm_first; unsigned int sm_height; @@ -51,7 +48,7 @@ struct strip_mine { * @ip: the inode * @dibh: the dinode buffer * @block: the block number that was allocated - * @private: any locked page held by the caller process + * @page: The (optional) page. This is looked up if @page is NULL * * Returns: errno */ @@ -64,7 +61,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, int release = 0; if (!page || page->index) { - page = grab_cache_page(inode->i_mapping, 0); + page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); if (!page) return -ENOMEM; release = 1; @@ -72,11 +69,13 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, if (!PageUptodate(page)) { void *kaddr = kmap(page); + u64 dsize = i_size_read(inode); + + if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) + dsize = dibh->b_size - sizeof(struct gfs2_dinode); - memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), - ip->i_di.di_size); - memset(kaddr + ip->i_di.di_size, 0, - PAGE_CACHE_SIZE - ip->i_di.di_size); + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); + memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); kunmap(page); SetPageUptodate(page); @@ -95,7 +94,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, if (!gfs2_is_jdata(ip)) mark_buffer_dirty(bh); if (!gfs2_is_writeback(ip)) - gfs2_trans_add_bh(ip->i_gl, bh, 0); + gfs2_trans_add_data(ip->i_gl, bh); if (release) { unlock_page(page); @@ -108,8 +107,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, /** * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big * @ip: The GFS2 inode to unstuff - * @unstuffer: the routine that handles unstuffing a non-zero length file - * @private: private data for the unstuffer + * @page: The (optional) page. This is looked up if the @page is NULL * * This routine unstuffs a dinode and returns it to a "normal" state such * that the height can be grown in the traditional way. @@ -131,13 +129,16 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) if (error) goto out; - if (ip->i_di.di_size) { + if (i_size_read(&ip->i_inode)) { /* Get a free block, fill it with the stuffed data, and write it out to disk */ + unsigned int n = 1; + error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); + if (error) + goto out_brelse; if (isdir) { - block = gfs2_alloc_meta(ip); - + gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1); error = gfs2_dir_get_new_buffer(ip, block, &bh); if (error) goto out_brelse; @@ -145,8 +146,6 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) dibh, sizeof(struct gfs2_dinode)); brelse(bh); } else { - block = gfs2_alloc_data(ip); - error = gfs2_unstuffer_page(ip, dibh, block, page); if (error) goto out_brelse; @@ -155,18 +154,17 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) /* Set up the pointer to the new block */ - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); di = (struct gfs2_dinode *)dibh->b_data; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); - if (ip->i_di.di_size) { + if (i_size_read(&ip->i_inode)) { *(__be64 *)(di + 1) = cpu_to_be64(block); - ip->i_di.di_blocks++; - gfs2_set_inode_blocks(&ip->i_inode); - di->di_blocks = cpu_to_be64(ip->i_di.di_blocks); + gfs2_add_inode_blocks(&ip->i_inode, 1); + di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); } - ip->i_di.di_height = 1; + ip->i_height = 1; di->di_height = cpu_to_be16(1); out_brelse: @@ -176,114 +174,13 @@ out: return error; } -/** - * calc_tree_height - Calculate the height of a metadata tree - * @ip: The GFS2 inode - * @size: The proposed size of the file - * - * Work out how tall a metadata tree needs to be in order to accommodate a - * file of a particular size. If size is less than the current size of - * the inode, then the current size of the inode is used instead of the - * supplied one. - * - * Returns: the height the tree should be - */ - -static unsigned int calc_tree_height(struct gfs2_inode *ip, u64 size) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - u64 *arr; - unsigned int max, height; - - if (ip->i_di.di_size > size) - size = ip->i_di.di_size; - - if (gfs2_is_dir(ip)) { - arr = sdp->sd_jheightsize; - max = sdp->sd_max_jheight; - } else { - arr = sdp->sd_heightsize; - max = sdp->sd_max_height; - } - - for (height = 0; height < max; height++) - if (arr[height] >= size) - break; - - return height; -} - -/** - * build_height - Build a metadata tree of the requested height - * @ip: The GFS2 inode - * @height: The height to build to - * - * - * Returns: errno - */ - -static int build_height(struct inode *inode, unsigned height) -{ - struct gfs2_inode *ip = GFS2_I(inode); - unsigned new_height = height - ip->i_di.di_height; - struct buffer_head *dibh; - struct buffer_head *blocks[GFS2_MAX_META_HEIGHT]; - struct gfs2_dinode *di; - int error; - __be64 *bp; - u64 bn; - unsigned n; - - if (height <= ip->i_di.di_height) - return 0; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - return error; - - for(n = 0; n < new_height; n++) { - bn = gfs2_alloc_meta(ip); - blocks[n] = gfs2_meta_new(ip->i_gl, bn); - gfs2_trans_add_bh(ip->i_gl, blocks[n], 1); - } - - n = 0; - bn = blocks[0]->b_blocknr; - if (new_height > 1) { - for(; n < new_height-1; n++) { - gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, - GFS2_FORMAT_IN); - gfs2_buffer_clear_tail(blocks[n], - sizeof(struct gfs2_meta_header)); - bp = (__be64 *)(blocks[n]->b_data + - sizeof(struct gfs2_meta_header)); - *bp = cpu_to_be64(blocks[n+1]->b_blocknr); - brelse(blocks[n]); - blocks[n] = NULL; - } - } - gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN); - gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header), - dibh, sizeof(struct gfs2_dinode)); - brelse(blocks[n]); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - di = (struct gfs2_dinode *)dibh->b_data; - gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); - *(__be64 *)(di + 1) = cpu_to_be64(bn); - ip->i_di.di_height += new_height; - ip->i_di.di_blocks += new_height; - gfs2_set_inode_blocks(&ip->i_inode); - di->di_height = cpu_to_be16(ip->i_di.di_height); - di->di_blocks = cpu_to_be64(ip->i_di.di_blocks); - brelse(dibh); - return error; -} /** * find_metapath - Find path through the metadata tree - * @ip: The inode pointer + * @sdp: The superblock * @mp: The metapath to return the result in * @block: The disk block to look up + * @height: The pre-calculated height of the metadata tree * * This routine returns a struct metapath structure that defines a path * through the metadata of inode "ip" to get to block "block". @@ -338,21 +235,25 @@ static int build_height(struct inode *inode, unsigned height) * */ -static void find_metapath(struct gfs2_inode *ip, u64 block, - struct metapath *mp) +static void find_metapath(const struct gfs2_sbd *sdp, u64 block, + struct metapath *mp, unsigned int height) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - u64 b = block; unsigned int i; - for (i = ip->i_di.di_height; i--;) - mp->mp_list[i] = do_div(b, sdp->sd_inptrs); + for (i = height; i--;) + mp->mp_list[i] = do_div(block, sdp->sd_inptrs); } +static inline unsigned int metapath_branch_start(const struct metapath *mp) +{ + if (mp->mp_list[0] == 0) + return 2; + return 1; +} + /** * metapointer - Return pointer to start of metadata in a buffer - * @bh: The buffer * @height: The metadata height (0 = dinode) * @mp: The metapath * @@ -361,93 +262,339 @@ static void find_metapath(struct gfs2_inode *ip, u64 block, * metadata tree. */ -static inline __be64 *metapointer(struct buffer_head *bh, int *boundary, - unsigned int height, const struct metapath *mp) +static inline __be64 *metapointer(unsigned int height, const struct metapath *mp) { + struct buffer_head *bh = mp->mp_bh[height]; unsigned int head_size = (height > 0) ? sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode); - __be64 *ptr; - *boundary = 0; - ptr = ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; - if (ptr + 1 == (__be64 *)(bh->b_data + bh->b_size)) - *boundary = 1; - return ptr; + return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; +} + +static void gfs2_metapath_ra(struct gfs2_glock *gl, + const struct buffer_head *bh, const __be64 *pos) +{ + struct buffer_head *rabh; + const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size); + const __be64 *t; + + for (t = pos; t < endp; t++) { + if (!*t) + continue; + + rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE); + if (trylock_buffer(rabh)) { + if (!buffer_uptodate(rabh)) { + rabh->b_end_io = end_buffer_read_sync; + submit_bh(READA | REQ_META, rabh); + continue; + } + unlock_buffer(rabh); + } + brelse(rabh); + } } /** - * lookup_block - Get the next metadata block in metadata tree - * @ip: The GFS2 inode - * @bh: Buffer containing the pointers to metadata blocks - * @height: The height of the tree (0 = dinode) + * lookup_metapath - Walk the metadata tree to a specific point + * @ip: The inode * @mp: The metapath - * @create: Non-zero if we may create a new meatdata block - * @new: Used to indicate if we did create a new metadata block - * @block: the returned disk block number * - * Given a metatree, complete to a particular height, checks to see if the next - * height of the tree exists. If not the next height of the tree is created. - * The block number of the next height of the metadata tree is returned. + * Assumes that the inode's buffer has already been looked up and + * hooked onto mp->mp_bh[0] and that the metapath has been initialised + * by find_metapath(). * + * If this function encounters part of the tree which has not been + * allocated, it returns the current height of the tree at the point + * at which it found the unallocated block. Blocks which are found are + * added to the mp->mp_bh[] list. + * + * Returns: error or height of metadata tree */ -static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh, - unsigned int height, struct metapath *mp, int create, - int *new, u64 *block) +static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) { - int boundary; - __be64 *ptr = metapointer(bh, &boundary, height, mp); + unsigned int end_of_metadata = ip->i_height - 1; + unsigned int x; + __be64 *ptr; + u64 dblock; + int ret; - if (*ptr) { - *block = be64_to_cpu(*ptr); - return boundary; - } + for (x = 0; x < end_of_metadata; x++) { + ptr = metapointer(x, mp); + dblock = be64_to_cpu(*ptr); + if (!dblock) + return x + 1; - *block = 0; + ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, &mp->mp_bh[x+1]); + if (ret) + return ret; + } - if (!create) - return 0; + return ip->i_height; +} - if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip)) - *block = gfs2_alloc_data(ip); - else - *block = gfs2_alloc_meta(ip); +static inline void release_metapath(struct metapath *mp) +{ + int i; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) { + if (mp->mp_bh[i] == NULL) + break; + brelse(mp->mp_bh[i]); + } +} - *ptr = cpu_to_be64(*block); - ip->i_di.di_blocks++; - gfs2_set_inode_blocks(&ip->i_inode); +/** + * gfs2_extent_length - Returns length of an extent of blocks + * @start: Start of the buffer + * @len: Length of the buffer in bytes + * @ptr: Current position in the buffer + * @limit: Max extent length to return (0 = unlimited) + * @eob: Set to 1 if we hit "end of block" + * + * If the first block is zero (unallocated) it will return the number of + * unallocated blocks in the extent, otherwise it will return the number + * of contiguous blocks in the extent. + * + * Returns: The length of the extent (minimum of one block) + */ - *new = 1; - return 0; +static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob) +{ + const __be64 *end = (start + len); + const __be64 *first = ptr; + u64 d = be64_to_cpu(*ptr); + + *eob = 0; + do { + ptr++; + if (ptr >= end) + break; + if (limit && --limit == 0) + break; + if (d) + d++; + } while(be64_to_cpu(*ptr) == d); + if (ptr >= end) + *eob = 1; + return (ptr - first); } -static inline void bmap_lock(struct inode *inode, int create) +static inline void bmap_lock(struct gfs2_inode *ip, int create) { - struct gfs2_inode *ip = GFS2_I(inode); if (create) down_write(&ip->i_rw_mutex); else down_read(&ip->i_rw_mutex); } -static inline void bmap_unlock(struct inode *inode, int create) +static inline void bmap_unlock(struct gfs2_inode *ip, int create) { - struct gfs2_inode *ip = GFS2_I(inode); if (create) up_write(&ip->i_rw_mutex); else up_read(&ip->i_rw_mutex); } +static inline __be64 *gfs2_indirect_init(struct metapath *mp, + struct gfs2_glock *gl, unsigned int i, + unsigned offset, u64 bn) +{ + __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data + + ((i > 1) ? sizeof(struct gfs2_meta_header) : + sizeof(struct gfs2_dinode))); + BUG_ON(i < 1); + BUG_ON(mp->mp_bh[i] != NULL); + mp->mp_bh[i] = gfs2_meta_new(gl, bn); + gfs2_trans_add_meta(gl, mp->mp_bh[i]); + gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN); + gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); + ptr += offset; + *ptr = cpu_to_be64(bn); + return ptr; +} + +enum alloc_state { + ALLOC_DATA = 0, + ALLOC_GROW_DEPTH = 1, + ALLOC_GROW_HEIGHT = 2, + /* ALLOC_UNSTUFF = 3, TBD and rather complicated */ +}; + +/** + * gfs2_bmap_alloc - Build a metadata tree of the requested height + * @inode: The GFS2 inode + * @lblock: The logical starting block of the extent + * @bh_map: This is used to return the mapping details + * @mp: The metapath + * @sheight: The starting height (i.e. whats already mapped) + * @height: The height to build to + * @maxlen: The max number of data blocks to alloc + * + * In this routine we may have to alloc: + * i) Indirect blocks to grow the metadata tree height + * ii) Indirect blocks to fill in lower part of the metadata tree + * iii) Data blocks + * + * The function is in two parts. The first part works out the total + * number of blocks which we need. The second part does the actual + * allocation asking for an extent at a time (if enough contiguous free + * blocks are available, there will only be one request per bmap call) + * and uses the state machine to initialise the blocks in order. + * + * Returns: errno on error + */ + +static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, + struct buffer_head *bh_map, struct metapath *mp, + const unsigned int sheight, + const unsigned int height, + const unsigned int maxlen) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct super_block *sb = sdp->sd_vfs; + struct buffer_head *dibh = mp->mp_bh[0]; + u64 bn, dblock = 0; + unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; + unsigned dblks = 0; + unsigned ptrs_per_blk; + const unsigned end_of_metadata = height - 1; + int ret; + int eob = 0; + enum alloc_state state; + __be64 *ptr; + __be64 zero_bn = 0; + + BUG_ON(sheight < 1); + BUG_ON(dibh == NULL); + + gfs2_trans_add_meta(ip->i_gl, dibh); + + if (height == sheight) { + struct buffer_head *bh; + /* Bottom indirect block exists, find unalloced extent size */ + ptr = metapointer(end_of_metadata, mp); + bh = mp->mp_bh[end_of_metadata]; + dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, + &eob); + BUG_ON(dblks < 1); + state = ALLOC_DATA; + } else { + /* Need to allocate indirect blocks */ + ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs; + dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]); + if (height == ip->i_height) { + /* Writing into existing tree, extend tree down */ + iblks = height - sheight; + state = ALLOC_GROW_DEPTH; + } else { + /* Building up tree height */ + state = ALLOC_GROW_HEIGHT; + iblks = height - ip->i_height; + branch_start = metapath_branch_start(mp); + iblks += (height - branch_start); + } + } + + /* start of the second part of the function (state machine) */ + + blks = dblks + iblks; + i = sheight; + do { + int error; + n = blks - alloced; + error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); + if (error) + return error; + alloced += n; + if (state != ALLOC_DATA || gfs2_is_jdata(ip)) + gfs2_trans_add_unrevoke(sdp, bn, n); + switch (state) { + /* Growing height of tree */ + case ALLOC_GROW_HEIGHT: + if (i == 1) { + ptr = (__be64 *)(dibh->b_data + + sizeof(struct gfs2_dinode)); + zero_bn = *ptr; + } + for (; i - 1 < height - ip->i_height && n > 0; i++, n--) + gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++); + if (i - 1 == height - ip->i_height) { + i--; + gfs2_buffer_copy_tail(mp->mp_bh[i], + sizeof(struct gfs2_meta_header), + dibh, sizeof(struct gfs2_dinode)); + gfs2_buffer_clear_tail(dibh, + sizeof(struct gfs2_dinode) + + sizeof(__be64)); + ptr = (__be64 *)(mp->mp_bh[i]->b_data + + sizeof(struct gfs2_meta_header)); + *ptr = zero_bn; + state = ALLOC_GROW_DEPTH; + for(i = branch_start; i < height; i++) { + if (mp->mp_bh[i] == NULL) + break; + brelse(mp->mp_bh[i]); + mp->mp_bh[i] = NULL; + } + i = branch_start; + } + if (n == 0) + break; + /* Branching from existing tree */ + case ALLOC_GROW_DEPTH: + if (i > 1 && i < height) + gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]); + for (; i < height && n > 0; i++, n--) + gfs2_indirect_init(mp, ip->i_gl, i, + mp->mp_list[i-1], bn++); + if (i == height) + state = ALLOC_DATA; + if (n == 0) + break; + /* Tree complete, adding data blocks */ + case ALLOC_DATA: + BUG_ON(n > dblks); + BUG_ON(mp->mp_bh[end_of_metadata] == NULL); + gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]); + dblks = n; + ptr = metapointer(end_of_metadata, mp); + dblock = bn; + while (n-- > 0) + *ptr++ = cpu_to_be64(bn++); + if (buffer_zeronew(bh_map)) { + ret = sb_issue_zeroout(sb, dblock, dblks, + GFP_NOFS); + if (ret) { + fs_err(sdp, + "Failed to zero data buffers\n"); + clear_buffer_zeronew(bh_map); + } + } + break; + } + } while ((state != ALLOC_DATA) || !dblock); + + ip->i_height = height; + gfs2_add_inode_blocks(&ip->i_inode, alloced); + gfs2_dinode_out(ip, mp->mp_bh[0]->b_data); + map_bh(bh_map, inode->i_sb, dblock); + bh_map->b_size = dblks << inode->i_blkbits; + set_buffer_new(bh_map); + return 0; +} + /** * gfs2_block_map - Map a block from an inode to a disk block * @inode: The inode * @lblock: The logical block number * @bh_map: The bh to be mapped + * @create: True if its ok to alloc blocks to satify the request * - * Find the block number on the current device which corresponds to an - * inode's block. If the block had to be created, "new" will be set. + * Sets buffer_mapped() if successful, sets buffer_boundary() if a + * read of metadata will be required before the next block can be + * mapped. Sets buffer_new() if new blocks were allocated. * * Returns: errno */ @@ -457,97 +604,80 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - struct buffer_head *bh; - unsigned int bsize; - unsigned int height; - unsigned int end_of_metadata; - unsigned int x; - int error = 0; - int new = 0; - u64 dblock = 0; - int boundary; - unsigned int maxlen = bh_map->b_size >> inode->i_blkbits; - struct metapath mp; + unsigned int bsize = sdp->sd_sb.sb_bsize; + const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits; + const u64 *arr = sdp->sd_heightsize; + __be64 *ptr; u64 size; - struct buffer_head *dibh = NULL; + struct metapath mp; + int ret; + int eob; + unsigned int len; + struct buffer_head *bh; + u8 height; BUG_ON(maxlen == 0); - if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip))) - return 0; - - bmap_lock(inode, create); + memset(mp.mp_bh, 0, sizeof(mp.mp_bh)); + bmap_lock(ip, create); clear_buffer_mapped(bh_map); clear_buffer_new(bh_map); clear_buffer_boundary(bh_map); - bsize = gfs2_is_dir(ip) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize; - size = (lblock + 1) * bsize; - - if (size > ip->i_di.di_size) { - height = calc_tree_height(ip, size); - if (ip->i_di.di_height < height) { - if (!create) - goto out_ok; - - error = build_height(inode, height); - if (error) - goto out_fail; - } + trace_gfs2_bmap(ip, bh_map, lblock, create, 1); + if (gfs2_is_dir(ip)) { + bsize = sdp->sd_jbsize; + arr = sdp->sd_jheightsize; } - find_metapath(ip, lblock, &mp); - end_of_metadata = ip->i_di.di_height - 1; - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - goto out_fail; - dibh = bh; - get_bh(dibh); + ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]); + if (ret) + goto out; - for (x = 0; x < end_of_metadata; x++) { - lookup_block(ip, bh, x, &mp, create, &new, &dblock); - brelse(bh); - if (!dblock) - goto out_ok; + height = ip->i_height; + size = (lblock + 1) * bsize; + while (size > arr[height]) + height++; + find_metapath(sdp, lblock, &mp, height); + ret = 1; + if (height > ip->i_height || gfs2_is_stuffed(ip)) + goto do_alloc; + ret = lookup_metapath(ip, &mp); + if (ret < 0) + goto out; + if (ret != ip->i_height) + goto do_alloc; + ptr = metapointer(ip->i_height - 1, &mp); + if (*ptr == 0) + goto do_alloc; + map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr)); + bh = mp.mp_bh[ip->i_height - 1]; + len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob); + bh_map->b_size = (len << inode->i_blkbits); + if (eob) + set_buffer_boundary(bh_map); + ret = 0; +out: + release_metapath(&mp); + trace_gfs2_bmap(ip, bh_map, lblock, create, ret); + bmap_unlock(ip, create); + return ret; - error = gfs2_meta_indirect_buffer(ip, x+1, dblock, new, &bh); - if (error) - goto out_fail; +do_alloc: + /* All allocations are done here, firstly check create flag */ + if (!create) { + BUG_ON(gfs2_is_stuffed(ip)); + ret = 0; + goto out; } - boundary = lookup_block(ip, bh, end_of_metadata, &mp, create, &new, &dblock); - if (dblock) { - map_bh(bh_map, inode->i_sb, dblock); - if (boundary) - set_buffer_boundary(bh_map); - if (new) { - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - set_buffer_new(bh_map); - goto out_brelse; - } - while(--maxlen && !buffer_boundary(bh_map)) { - u64 eblock; - - mp.mp_list[end_of_metadata]++; - boundary = lookup_block(ip, bh, end_of_metadata, &mp, 0, &new, &eblock); - if (eblock != ++dblock) - break; - bh_map->b_size += (1 << inode->i_blkbits); - if (boundary) - set_buffer_boundary(bh_map); - } - } -out_brelse: - brelse(bh); -out_ok: - error = 0; -out_fail: - if (dibh) - brelse(dibh); - bmap_unlock(inode, create); - return error; + /* At this point ret is the tree depth of already allocated blocks */ + ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen); + goto out; } +/* + * Deprecated: do not use in new code + */ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen) { struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 }; @@ -558,7 +688,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi BUG_ON(!dblock); BUG_ON(!new); - bh.b_size = 1 << (inode->i_blkbits + 5); + bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5)); ret = gfs2_block_map(inode, lblock, &bh, create); *extlen = bh.b_size >> inode->i_blkbits; *dblock = bh.b_blocknr; @@ -570,76 +700,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi } /** - * recursive_scan - recursively scan through the end of a file - * @ip: the inode - * @dibh: the dinode buffer - * @mp: the path through the metadata to the point to start - * @height: the height the recursion is at - * @block: the indirect block to look at - * @first: 1 if this is the first block - * @bc: the call to make for each piece of metadata - * @data: data opaque to this function to pass to @bc - * - * When this is first called @height and @block should be zero and - * @first should be 1. - * - * Returns: errno - */ - -static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, - struct metapath *mp, unsigned int height, - u64 block, int first, block_call_t bc, - void *data) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct buffer_head *bh = NULL; - __be64 *top, *bottom; - u64 bn; - int error; - int mh_size = sizeof(struct gfs2_meta_header); - - if (!height) { - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - return error; - dibh = bh; - - top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; - bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; - } else { - error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); - if (error) - return error; - - top = (__be64 *)(bh->b_data + mh_size) + - (first ? mp->mp_list[height] : 0); - - bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; - } - - error = bc(ip, dibh, bh, top, bottom, height, data); - if (error) - goto out; - - if (height < ip->i_di.di_height - 1) - for (; top < bottom; top++, first = 0) { - if (!*top) - continue; - - bn = be64_to_cpu(*top); - - error = recursive_scan(ip, dibh, mp, height + 1, bn, - first, bc, data); - if (error) - break; - } - -out: - brelse(bh); - return error; -} - -/** * do_strip - Look for a layer a particular layer of the file and strip it off * @ip: the inode * @dibh: the dinode buffer @@ -647,20 +707,19 @@ out: * @top: The first pointer in the buffer * @bottom: One more than the last pointer * @height: the height this buffer is at - * @data: a pointer to a struct strip_mine + * @sm: a pointer to a struct strip_mine * * Returns: errno */ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, struct buffer_head *bh, __be64 *top, __be64 *bottom, - unsigned int height, void *data) + unsigned int height, struct strip_mine *sm) { - struct strip_mine *sm = data; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrp_list rlist; u64 bn, bstart; - u32 blen; + u32 blen, btotal; __be64 *p; unsigned int rg_blocks = 0; int metadata; @@ -668,6 +727,10 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, int x; int error; + error = gfs2_rindex_update(sdp); + if (error) + return error; + if (!*top) sm->sm_first = 0; @@ -679,13 +742,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, sm->sm_first = 0; } - metadata = (height != ip->i_di.di_height - 1); + metadata = (height != ip->i_height - 1); if (metadata) revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; - - error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); - if (error) - return error; + else if (ip->i_depth) + revokes = sdp->sd_inptrs; memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); bstart = 0; @@ -701,7 +762,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, blen++; else { if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); bstart = bn; blen = 1; @@ -709,11 +770,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, } if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); else goto out; /* Nothing to do */ - gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); for (x = 0; x < rlist.rl_rgrps; x++) { struct gfs2_rgrpd *rgd; @@ -725,6 +786,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, if (error) goto out_rlist; + if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */ + gfs2_rs_deltree(ip->i_res); + error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT + RES_STATFS + RES_QUOTA, revokes); @@ -733,11 +797,12 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, down_write(&ip->i_rw_mutex); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_trans_add_meta(ip->i_gl, bh); bstart = 0; blen = 0; + btotal = 0; for (p = top; p < bottom; p++) { if (!*p) @@ -749,10 +814,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, blen++; else { if (bstart) { - if (metadata) - gfs2_free_meta(ip, bstart, blen); - else - gfs2_free_data(ip, bstart, blen); + __gfs2_free_blocks(ip, bstart, blen, metadata); + btotal += blen; } bstart = bn; @@ -760,18 +823,17 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, } *p = 0; - if (!ip->i_di.di_blocks) - gfs2_consist_inode(ip); - ip->i_di.di_blocks--; - gfs2_set_inode_blocks(&ip->i_inode); + gfs2_add_inode_blocks(&ip->i_inode, -1); } if (bstart) { - if (metadata) - gfs2_free_meta(ip, bstart, blen); - else - gfs2_free_data(ip, bstart, blen); + __gfs2_free_blocks(ip, bstart, blen, metadata); + btotal += blen; } + gfs2_statfs_change(sdp, 0, +btotal, 0); + gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, + ip->i_inode.i_gid); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(ip, dibh->b_data); @@ -785,86 +847,77 @@ out_rg_gunlock: out_rlist: gfs2_rlist_free(&rlist); out: - gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); return error; } /** - * do_grow - Make a file look bigger than it is + * recursive_scan - recursively scan through the end of a file * @ip: the inode - * @size: the size to set the file to + * @dibh: the dinode buffer + * @mp: the path through the metadata to the point to start + * @height: the height the recursion is at + * @block: the indirect block to look at + * @first: 1 if this is the first block + * @sm: data opaque to this function to pass to @bc * - * Called with an exclusive lock on @ip. + * When this is first called @height and @block should be zero and + * @first should be 1. * * Returns: errno */ -static int do_grow(struct gfs2_inode *ip, u64 size) +static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, + struct metapath *mp, unsigned int height, + u64 block, int first, struct strip_mine *sm) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al; - struct buffer_head *dibh; - unsigned int h; + struct buffer_head *bh = NULL; + __be64 *top, *bottom; + u64 bn; int error; + int mh_size = sizeof(struct gfs2_meta_header); - al = gfs2_alloc_get(ip); + if (!height) { + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return error; + dibh = bh; - error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); - if (error) - goto out; + top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; + bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; + } else { + error = gfs2_meta_indirect_buffer(ip, height, block, &bh); + if (error) + return error; - error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); - if (error) - goto out_gunlock_q; + top = (__be64 *)(bh->b_data + mh_size) + + (first ? mp->mp_list[height] : 0); - al->al_requested = sdp->sd_max_height + RES_DATA; + bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; + } - error = gfs2_inplace_reserve(ip); + error = do_strip(ip, dibh, bh, top, bottom, height, sm); if (error) - goto out_gunlock_q; + goto out; - error = gfs2_trans_begin(sdp, - sdp->sd_max_height + al->al_rgd->rd_length + - RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0); - if (error) - goto out_ipres; + if (height < ip->i_height - 1) { - if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { - if (gfs2_is_stuffed(ip)) { - error = gfs2_unstuff_dinode(ip, NULL); - if (error) - goto out_end_trans; - } + gfs2_metapath_ra(ip->i_gl, bh, top); + + for (; top < bottom; top++, first = 0) { + if (!*top) + continue; + + bn = be64_to_cpu(*top); - h = calc_tree_height(ip, size); - if (ip->i_di.di_height < h) { - down_write(&ip->i_rw_mutex); - error = build_height(&ip->i_inode, h); - up_write(&ip->i_rw_mutex); + error = recursive_scan(ip, dibh, mp, height + 1, bn, + first, sm); if (error) - goto out_end_trans; + break; } } - - ip->i_di.di_size = size; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out_end_trans; - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - -out_end_trans: - gfs2_trans_end(sdp); -out_ipres: - gfs2_inplace_release(ip); -out_gunlock_q: - gfs2_quota_unlock(ip); out: - gfs2_alloc_put(ip); + brelse(bh); return error; } @@ -874,11 +927,10 @@ out: * * This is partly borrowed from ext3. */ -static int gfs2_block_truncate_page(struct address_space *mapping) +static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) { struct inode *inode = mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - loff_t from = inode->i_size; unsigned long index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); unsigned blocksize, iblock, length, pos; @@ -886,7 +938,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping) struct page *page; int err; - page = grab_cache_page(mapping, index); + page = find_or_create_page(mapping, index, GFP_NOFS); if (!page) return 0; @@ -930,25 +982,64 @@ static int gfs2_block_truncate_page(struct address_space *mapping) } if (!gfs2_is_writeback(ip)) - gfs2_trans_add_bh(ip->i_gl, bh, 0); + gfs2_trans_add_data(ip->i_gl, bh); zero_user(page, offset, length); - + mark_buffer_dirty(bh); unlock: unlock_page(page); page_cache_release(page); return err; } -static int trunc_start(struct gfs2_inode *ip, u64 size) +#define GFS2_JTRUNC_REVOKES 8192 + +/** + * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files + * @inode: The inode being truncated + * @oldsize: The original (larger) size + * @newsize: The new smaller size + * + * With jdata files, we have to journal a revoke for each block which is + * truncated. As a result, we need to split this into separate transactions + * if the number of pages being truncated gets too large. + */ + +static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize; + u64 chunk; + int error; + + while (oldsize != newsize) { + chunk = oldsize - newsize; + if (chunk > max_chunk) + chunk = max_chunk; + truncate_pagecache(inode, oldsize - chunk); + oldsize -= chunk; + gfs2_trans_end(sdp); + error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); + if (error) + return error; + } + + return 0; +} + +static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *mapping = inode->i_mapping; struct buffer_head *dibh; int journaled = gfs2_is_jdata(ip); int error; - error = gfs2_trans_begin(sdp, - RES_DINODE + (journaled ? RES_JDATA : 0), 0); + if (journaled) + error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES); + else + error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) return error; @@ -956,29 +1047,35 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) if (error) goto out; - if (gfs2_is_stuffed(ip)) { - ip->i_di.di_size = size; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size); - error = 1; + gfs2_trans_add_meta(ip->i_gl, dibh); + if (gfs2_is_stuffed(ip)) { + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); } else { - if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) - error = gfs2_block_truncate_page(ip->i_inode.i_mapping); - - if (!error) { - ip->i_di.di_size = size; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); + if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) { + error = gfs2_block_truncate_page(mapping, newsize); + if (error) + goto out_brelse; } + ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; } - brelse(dibh); + i_size_write(inode, newsize); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_dinode_out(ip, dibh->b_data); + + if (journaled) + error = gfs2_journaled_truncate(inode, oldsize, newsize); + else + truncate_pagecache(inode, newsize); + if (error) { + brelse(dibh); + return error; + } + +out_brelse: + brelse(dibh); out: gfs2_trans_end(sdp); return error; @@ -986,7 +1083,8 @@ out: static int trunc_dealloc(struct gfs2_inode *ip, u64 size) { - unsigned int height = ip->i_di.di_height; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int height = ip->i_height; u64 lblock; struct metapath mp; int error; @@ -994,29 +1092,29 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size) if (!size) lblock = 0; else - lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift; + lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift; - find_metapath(ip, lblock, &mp); - gfs2_alloc_get(ip); + find_metapath(sdp, lblock, &mp, ip->i_height); + error = gfs2_rindex_update(sdp); + if (error) + return error; - error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) - goto out; + return error; while (height--) { struct strip_mine sm; sm.sm_first = !!size; sm.sm_height = height; - error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm); + error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm); if (error) break; } gfs2_quota_unhold(ip); -out: - gfs2_alloc_put(ip); return error; } @@ -1036,17 +1134,16 @@ static int trunc_end(struct gfs2_inode *ip) if (error) goto out; - if (!ip->i_di.di_size) { - ip->i_di.di_height = 0; - ip->i_di.di_goal_meta = - ip->i_di.di_goal_data = - ip->i_no_addr; + if (!i_size_read(&ip->i_inode)) { + ip->i_height = 0; + ip->i_goal = ip->i_no_addr; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + gfs2_ordered_del_inode(ip); } ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG; + ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1058,92 +1155,168 @@ out: /** * do_shrink - make a file smaller - * @ip: the inode - * @size: the size to make the file - * @truncator: function to truncate the last partial block + * @inode: the inode + * @oldsize: the current inode size + * @newsize: the size to make the file * - * Called with an exclusive lock on @ip. + * Called with an exclusive lock on @inode. The @size must + * be equal to or smaller than the current inode size. * * Returns: errno */ -static int do_shrink(struct gfs2_inode *ip, u64 size) +static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize) { + struct gfs2_inode *ip = GFS2_I(inode); int error; - error = trunc_start(ip, size); + error = trunc_start(inode, oldsize, newsize); if (error < 0) return error; - if (error > 0) + if (gfs2_is_stuffed(ip)) return 0; - error = trunc_dealloc(ip, size); - if (!error) + error = trunc_dealloc(ip, newsize); + if (error == 0) error = trunc_end(ip); return error; } -static int do_touch(struct gfs2_inode *ip, u64 size) +void gfs2_trim_blocks(struct inode *inode) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u64 size = inode->i_size; + int ret; + + ret = do_shrink(inode, size, size); + WARN_ON(ret != 0); +} + +/** + * do_grow - Touch and update inode size + * @inode: The inode + * @size: The new size + * + * This function updates the timestamps on the inode and + * may also increase the size of the inode. This function + * must not be called with @size any smaller than the current + * inode size. + * + * Although it is not strictly required to unstuff files here, + * earlier versions of GFS2 have a bug in the stuffed file reading + * code which will result in a buffer overrun if the size is larger + * than the max stuffed file size. In order to prevent this from + * occurring, such files are unstuffed, but in other cases we can + * just update the inode size directly. + * + * Returns: 0 on success, or -ve on error + */ + +static int do_grow(struct inode *inode, u64 size) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_alloc_parms ap = { .target = 1, }; struct buffer_head *dibh; int error; + int unstuff = 0; - error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (gfs2_is_stuffed(ip) && + (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) { + error = gfs2_quota_lock_check(ip); + if (error) + return error; + + error = gfs2_inplace_reserve(ip, &ap); + if (error) + goto do_grow_qunlock; + unstuff = 1; + } + + error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT + + (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ? + 0 : RES_QUOTA), 0); if (error) - return error; + goto do_grow_release; - down_write(&ip->i_rw_mutex); + if (unstuff) { + error = gfs2_unstuff_dinode(ip, NULL); + if (error) + goto do_end_trans; + } error = gfs2_meta_inode_buffer(ip, &dibh); if (error) - goto do_touch_out; + goto do_end_trans; + i_size_write(inode, size); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); -do_touch_out: - up_write(&ip->i_rw_mutex); +do_end_trans: gfs2_trans_end(sdp); +do_grow_release: + if (unstuff) { + gfs2_inplace_release(ip); +do_grow_qunlock: + gfs2_quota_unlock(ip); + } return error; } /** - * gfs2_truncatei - make a file a given size - * @ip: the inode - * @size: the size to make the file - * @truncator: function to truncate the last partial block + * gfs2_setattr_size - make a file a given size + * @inode: the inode + * @newsize: the size to make the file * - * The file size can grow, shrink, or stay the same size. + * The file size can grow, shrink, or stay the same size. This + * is called holding i_mutex and an exclusive glock on the inode + * in question. * * Returns: errno */ -int gfs2_truncatei(struct gfs2_inode *ip, u64 size) +int gfs2_setattr_size(struct inode *inode, u64 newsize) { - int error; + struct gfs2_inode *ip = GFS2_I(inode); + int ret; + u64 oldsize; - if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) - return -EINVAL; + BUG_ON(!S_ISREG(inode->i_mode)); - if (size > ip->i_di.di_size) - error = do_grow(ip, size); - else if (size < ip->i_di.di_size) - error = do_shrink(ip, size); - else - /* update time stamps */ - error = do_touch(ip, size); + ret = inode_newsize_ok(inode, newsize); + if (ret) + return ret; - return error; + ret = get_write_access(inode); + if (ret) + return ret; + + inode_dio_wait(inode); + + ret = gfs2_rs_alloc(ip); + if (ret) + goto out; + + oldsize = inode->i_size; + if (newsize >= oldsize) { + ret = do_grow(inode, newsize); + goto out; + } + + gfs2_rs_deltree(ip->i_res); + ret = do_shrink(inode, oldsize, newsize); +out: + put_write_access(inode); + return ret; } int gfs2_truncatei_resume(struct gfs2_inode *ip) { int error; - error = trunc_dealloc(ip, ip->i_di.di_size); + error = trunc_dealloc(ip, i_size_read(&ip->i_inode)); if (!error) error = trunc_end(ip); return error; @@ -1155,32 +1328,118 @@ int gfs2_file_dealloc(struct gfs2_inode *ip) } /** - * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file - * @ip: the file - * @len: the number of bytes to be written to the file - * @data_blocks: returns the number of data blocks required - * @ind_blocks: returns the number of indirect blocks required + * gfs2_free_journal_extents - Free cached journal bmap info + * @jd: The journal * */ -void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len, - unsigned int *data_blocks, unsigned int *ind_blocks) +void gfs2_free_journal_extents(struct gfs2_jdesc *jd) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int tmp; + struct gfs2_journal_extent *jext; - if (gfs2_is_dir(ip)) { - *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2; - *ind_blocks = 3 * (sdp->sd_max_jheight - 1); - } else { - *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3; - *ind_blocks = 3 * (sdp->sd_max_height - 1); + while(!list_empty(&jd->extent_list)) { + jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list); + list_del(&jext->list); + kfree(jext); } +} - for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) { - tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); - *ind_blocks += tmp; +/** + * gfs2_add_jextent - Add or merge a new extent to extent cache + * @jd: The journal descriptor + * @lblock: The logical block at start of new extent + * @dblock: The physical block at start of new extent + * @blocks: Size of extent in fs blocks + * + * Returns: 0 on success or -ENOMEM + */ + +static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks) +{ + struct gfs2_journal_extent *jext; + + if (!list_empty(&jd->extent_list)) { + jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list); + if ((jext->dblock + jext->blocks) == dblock) { + jext->blocks += blocks; + return 0; + } } + + jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS); + if (jext == NULL) + return -ENOMEM; + jext->dblock = dblock; + jext->lblock = lblock; + jext->blocks = blocks; + list_add_tail(&jext->list, &jd->extent_list); + jd->nr_extents++; + return 0; +} + +/** + * gfs2_map_journal_extents - Cache journal bmap info + * @sdp: The super block + * @jd: The journal to map + * + * Create a reusable "extent" mapping from all logical + * blocks to all physical blocks for the given journal. This will save + * us time when writing journal blocks. Most journals will have only one + * extent that maps all their logical blocks. That's because gfs2.mkfs + * arranges the journal blocks sequentially to maximize performance. + * So the extent would map the first block for the entire file length. + * However, gfs2_jadd can happen while file activity is happening, so + * those journals may not be sequential. Less likely is the case where + * the users created their own journals by mounting the metafs and + * laying it out. But it's still possible. These journals might have + * several extents. + * + * Returns: 0 on success, or error on failure + */ + +int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) +{ + u64 lblock = 0; + u64 lblock_stop; + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct buffer_head bh; + unsigned int shift = sdp->sd_sb.sb_bsize_shift; + u64 size; + int rc; + + lblock_stop = i_size_read(jd->jd_inode) >> shift; + size = (lblock_stop - lblock) << shift; + jd->nr_extents = 0; + WARN_ON(!list_empty(&jd->extent_list)); + + do { + bh.b_state = 0; + bh.b_blocknr = 0; + bh.b_size = size; + rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0); + if (rc || !buffer_mapped(&bh)) + goto fail; + rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift); + if (rc) + goto fail; + size -= bh.b_size; + lblock += (bh.b_size >> ip->i_inode.i_blkbits); + } while(size > 0); + + fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid, + jd->nr_extents); + return 0; + +fail: + fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n", + rc, jd->jd_jid, + (unsigned long long)(i_size_read(jd->jd_inode) - size), + jd->nr_extents); + fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n", + rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr, + bh.b_state, (unsigned long long)bh.b_size); + gfs2_free_journal_extents(jd); + return rc; } /** @@ -1188,21 +1447,18 @@ void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len, * @ip: the file being written to * @offset: the offset to write to * @len: the number of bytes being written - * @alloc_required: set to 1 if an alloc is required, 0 otherwise * - * Returns: errno + * Returns: 1 if an alloc is required, 0 otherwise */ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, - unsigned int len, int *alloc_required) + unsigned int len) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - u64 lblock, lblock_stop, dblock; - u32 extlen; - int new = 0; - int error = 0; - - *alloc_required = 0; + struct buffer_head bh; + unsigned int shift; + u64 lblock, lblock_stop, size; + u64 end_of_file; if (!len) return 0; @@ -1210,37 +1466,28 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, if (gfs2_is_stuffed(ip)) { if (offset + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) - *alloc_required = 1; + return 1; return 0; } - if (gfs2_is_dir(ip)) { - unsigned int bsize = sdp->sd_jbsize; - lblock = offset; - do_div(lblock, bsize); - lblock_stop = offset + len + bsize - 1; - do_div(lblock_stop, bsize); - } else { - unsigned int shift = sdp->sd_sb.sb_bsize_shift; - u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift; - lblock = offset >> shift; - lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; - if (lblock_stop > end_of_file) { - *alloc_required = 1; - return 0; - } - } - - for (; lblock < lblock_stop; lblock += extlen) { - error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen); - if (error) - return error; - - if (!dblock) { - *alloc_required = 1; - return 0; - } - } + shift = sdp->sd_sb.sb_bsize_shift; + BUG_ON(gfs2_is_dir(ip)); + end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift; + lblock = offset >> shift; + lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; + if (lblock_stop > end_of_file) + return 1; + + size = (lblock_stop - lblock) << shift; + do { + bh.b_state = 0; + bh.b_size = size; + gfs2_block_map(&ip->i_inode, lblock, &bh, 0); + if (!buffer_mapped(&bh)) + return 1; + size -= bh.b_size; + lblock += (bh.b_size >> ip->i_inode.i_blkbits); + } while(size > 0); return 0; } diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index 4e6cde2943b..81ded5e2aaa 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -10,22 +10,52 @@ #ifndef __BMAP_DOT_H__ #define __BMAP_DOT_H__ +#include "inode.h" + struct inode; struct gfs2_inode; struct page; -int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); -int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); -int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); -int gfs2_truncatei(struct gfs2_inode *ip, u64 size); -int gfs2_truncatei_resume(struct gfs2_inode *ip); -int gfs2_file_dealloc(struct gfs2_inode *ip); +/** + * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file + * @ip: the file + * @len: the number of bytes to be written to the file + * @data_blocks: returns the number of data blocks required + * @ind_blocks: returns the number of indirect blocks required + * + */ + +static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, + unsigned int len, + unsigned int *data_blocks, + unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int tmp; + + BUG_ON(gfs2_is_dir(ip)); + *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3; + *ind_blocks = 3 * (sdp->sd_max_height - 1); + + for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + *ind_blocks += tmp; + } +} -void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len, - unsigned int *data_blocks, - unsigned int *ind_blocks); -int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, - unsigned int len, int *alloc_required); +extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); +extern int gfs2_block_map(struct inode *inode, sector_t lblock, + struct buffer_head *bh, int create); +extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, + u64 *dblock, unsigned *extlen); +extern int gfs2_setattr_size(struct inode *inode, u64 size); +extern void gfs2_trim_blocks(struct inode *inode); +extern int gfs2_truncatei_resume(struct gfs2_inode *ip); +extern int gfs2_file_dealloc(struct gfs2_inode *ip); +extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, + unsigned int len); +extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd); +extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd); #endif /* __BMAP_DOT_H__ */ diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c deleted file mode 100644 index e51991947d2..00000000000 --- a/fs/gfs2/daemon.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/completion.h> -#include <linux/buffer_head.h> -#include <linux/kthread.h> -#include <linux/delay.h> -#include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> -#include <linux/freezer.h> - -#include "gfs2.h" -#include "incore.h" -#include "daemon.h" -#include "glock.h" -#include "log.h" -#include "quota.h" -#include "recovery.h" -#include "super.h" -#include "util.h" - -/* This uses schedule_timeout() instead of msleep() because it's good for - the daemons to wake up more often than the timeout when unmounting so - the user's unmount doesn't sit there forever. - - The kthread functions used to start these daemons block and flush signals. */ - -/** - * gfs2_glockd - Reclaim unused glock structures - * @sdp: Pointer to GFS2 superblock - * - * One or more of these daemons run, reclaiming glocks on sd_reclaim_list. - * Number of daemons can be set by user, with num_glockd mount option. - */ - -int gfs2_glockd(void *data) -{ - struct gfs2_sbd *sdp = data; - - while (!kthread_should_stop()) { - while (atomic_read(&sdp->sd_reclaim_count)) - gfs2_reclaim_glock(sdp); - - wait_event_interruptible(sdp->sd_reclaim_wq, - (atomic_read(&sdp->sd_reclaim_count) || - kthread_should_stop())); - if (freezing(current)) - refrigerator(); - } - - return 0; -} - -/** - * gfs2_recoverd - Recover dead machine's journals - * @sdp: Pointer to GFS2 superblock - * - */ - -int gfs2_recoverd(void *data) -{ - struct gfs2_sbd *sdp = data; - unsigned long t; - - while (!kthread_should_stop()) { - gfs2_check_journals(sdp); - t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ; - if (freezing(current)) - refrigerator(); - schedule_timeout_interruptible(t); - } - - return 0; -} - -/** - * gfs2_quotad - Write cached quota changes into the quota file - * @sdp: Pointer to GFS2 superblock - * - */ - -int gfs2_quotad(void *data) -{ - struct gfs2_sbd *sdp = data; - unsigned long t; - int error; - - while (!kthread_should_stop()) { - /* Update the master statfs file */ - - t = sdp->sd_statfs_sync_time + - gfs2_tune_get(sdp, gt_statfs_quantum) * HZ; - - if (time_after_eq(jiffies, t)) { - error = gfs2_statfs_sync(sdp); - if (error && - error != -EROFS && - !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - fs_err(sdp, "quotad: (1) error=%d\n", error); - sdp->sd_statfs_sync_time = jiffies; - } - - /* Update quota file */ - - t = sdp->sd_quota_sync_time + - gfs2_tune_get(sdp, gt_quota_quantum) * HZ; - - if (time_after_eq(jiffies, t)) { - error = gfs2_quota_sync(sdp); - if (error && - error != -EROFS && - !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - fs_err(sdp, "quotad: (2) error=%d\n", error); - sdp->sd_quota_sync_time = jiffies; - } - - gfs2_quota_scan(sdp); - - t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ; - if (freezing(current)) - refrigerator(); - schedule_timeout_interruptible(t); - } - - return 0; -} - diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h deleted file mode 100644 index 4be084fb6a6..00000000000 --- a/fs/gfs2/daemon.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __DAEMON_DOT_H__ -#define __DAEMON_DOT_H__ - -int gfs2_glockd(void *data); -int gfs2_recoverd(void *data); -int gfs2_quotad(void *data); - -#endif /* __DAEMON_DOT_H__ */ diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/dentry.c index 793e334d098..d3a5d4e29ba 100644 --- a/fs/gfs2/ops_dentry.c +++ b/fs/gfs2/dentry.c @@ -7,26 +7,25 @@ * of the GNU General Public License version 2. */ -#include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/gfs2_ondisk.h> +#include <linux/namei.h> #include <linux/crc32.h> -#include <linux/lm_interface.h> #include "gfs2.h" #include "incore.h" #include "dir.h" #include "glock.h" -#include "ops_dentry.h" +#include "super.h" #include "util.h" #include "inode.h" /** * gfs2_drevalidate - Check directory lookup consistency * @dentry: the mapping to check - * @nd: + * @flags: lookup flags * * Check to make sure the lookup necessary to arrive at this inode from its * parent is still good. @@ -34,16 +33,24 @@ * Returns: 1 if the dentry is ok, 0 if it isn't */ -static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) +static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) { - struct dentry *parent = dget_parent(dentry); - struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode); - struct gfs2_inode *dip = GFS2_I(parent->d_inode); - struct inode *inode = dentry->d_inode; + struct dentry *parent; + struct gfs2_sbd *sdp; + struct gfs2_inode *dip; + struct inode *inode; struct gfs2_holder d_gh; struct gfs2_inode *ip = NULL; int error; - int had_lock=0; + int had_lock = 0; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + sdp = GFS2_SB(parent->d_inode); + dip = GFS2_I(parent->d_inode); + inode = dentry->d_inode; if (inode) { if (is_bad_inode(inode)) @@ -51,10 +58,10 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) ip = GFS2_I(inode); } - if (sdp->sd_args.ar_localcaching) + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) goto valid; - had_lock = gfs2_glock_is_locked_by_me(dip->i_gl); + had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); if (!had_lock) { error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); if (error) @@ -86,12 +93,9 @@ invalid_gunlock: if (!had_lock) gfs2_glock_dq_uninit(&d_gh); invalid: - if (inode && S_ISDIR(inode->i_mode)) { - if (have_submounts(dentry)) - goto valid; - shrink_dcache_parent(dentry); - } - d_drop(dentry); + if (check_submounts_and_drop(dentry) != 0) + goto valid; + dput(parent); return 0; @@ -102,14 +106,32 @@ fail: return 0; } -static int gfs2_dhash(struct dentry *dentry, struct qstr *str) +static int gfs2_dhash(const struct dentry *dentry, struct qstr *str) { str->hash = gfs2_disk_hash(str->name, str->len); return 0; } -struct dentry_operations gfs2_dops = { +static int gfs2_dentry_delete(const struct dentry *dentry) +{ + struct gfs2_inode *ginode; + + if (!dentry->d_inode) + return 0; + + ginode = GFS2_I(dentry->d_inode); + if (!ginode->i_iopen_gh.gh_gl) + return 0; + + if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags)) + return 1; + + return 0; +} + +const struct dentry_operations gfs2_dops = { .d_revalidate = gfs2_drevalidate, .d_hash = gfs2_dhash, + .d_delete = gfs2_dentry_delete, }; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c34709512b1..1a349f9a968 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -36,7 +36,7 @@ * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the * beginning of the leaf block. The dirents reside in leaves when * - * dip->i_di.di_flags & GFS2_DIF_EXHASH is true + * dip->i_diskflags & GFS2_DIF_EXHASH is true * * Otherwise, the dirents are "linear", within a single stuffed dinode block. * @@ -53,6 +53,8 @@ * but never before the maximum hash table size has been reached. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/buffer_head.h> @@ -60,7 +62,6 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/vmalloc.h> -#include <linux/lm_interface.h> #include "gfs2.h" #include "incore.h" @@ -77,22 +78,24 @@ #define IS_LEAF 1 /* Hashed (leaf) directory */ #define IS_DINODE 2 /* Linear (stuffed dinode block) directory */ +#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */ + #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) -typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len, - u64 leaf_no, void *data); +struct qstr gfs2_qdot __read_mostly; +struct qstr gfs2_qdotdot __read_mostly; + typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, const struct qstr *name, void *opaque); - int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head **bhp) { struct buffer_head *bh; bh = gfs2_meta_new(ip->i_gl, block); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD); gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); *bhp = bh; @@ -126,10 +129,10 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, if (error) return error; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); - if (ip->i_di.di_size < offset + size) - ip->i_di.di_size = offset + size; + if (ip->i_inode.i_size < offset + size) + i_size_write(&ip->i_inode, offset + size); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(ip, dibh->b_data); @@ -159,6 +162,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, unsigned int o; int copied = 0; int error = 0; + int new = 0; if (!size) return 0; @@ -183,7 +187,6 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, while (copied < size) { unsigned int amount; struct buffer_head *bh; - int new = 0; amount = size - copied; if (amount > sdp->sd_sb.sb_bsize - o) @@ -208,7 +211,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, if (error) goto fail; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); memcpy(bh->b_data + o, buf, amount); brelse(bh); @@ -226,11 +229,11 @@ out: if (error) return error; - if (ip->i_di.di_size < offset + copied) - ip->i_di.di_size = offset + copied; + if (ip->i_inode.i_size < offset + copied) + i_size_write(&ip->i_inode, offset + copied); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -241,16 +244,15 @@ fail: return error; } -static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf, - u64 offset, unsigned int size) +static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, __be64 *buf, + unsigned int size) { struct buffer_head *dibh; int error; error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - offset += sizeof(struct gfs2_dinode); - memcpy(buf, dibh->b_data + offset, size); + memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size); brelse(dibh); } @@ -262,13 +264,12 @@ static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf, * gfs2_dir_read_data - Read a data from a directory inode * @ip: The GFS2 Inode * @buf: The buffer to place result into - * @offset: File offset to begin jdata_readng from * @size: Amount of data to transfer * * Returns: The amount of data actually copied or the error */ -static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, - unsigned int size, unsigned ra) +static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf, + unsigned int size) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); u64 lblock, dblock; @@ -277,22 +278,13 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, int copied = 0; int error = 0; - if (offset >= ip->i_di.di_size) - return 0; - - if (offset + size > ip->i_di.di_size) - size = ip->i_di.di_size - offset; - - if (!size) - return 0; - if (gfs2_is_stuffed(ip)) - return gfs2_dir_read_stuffed(ip, buf, offset, size); + return gfs2_dir_read_stuffed(ip, buf, size); if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) return -EINVAL; - lblock = offset; + lblock = 0; o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); while (copied < size) { @@ -311,8 +303,6 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, if (error || !dblock) goto fail; BUG_ON(extlen < 1); - if (!ra) - extlen = 1; bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); } else { error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh); @@ -328,7 +318,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, extlen--; memcpy(buf, bh->b_data + o, amount); brelse(bh); - buf += amount; + buf += (amount/sizeof(__be64)); copied += amount; lblock++; o = sizeof(struct gfs2_meta_header); @@ -339,6 +329,79 @@ fail: return (copied) ? copied : error; } +/** + * gfs2_dir_get_hash_table - Get pointer to the dir hash table + * @ip: The inode in question + * + * Returns: The hash table or an error + */ + +static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + int ret; + u32 hsize; + __be64 *hc; + + BUG_ON(!(ip->i_diskflags & GFS2_DIF_EXHASH)); + + hc = ip->i_hash_cache; + if (hc) + return hc; + + hsize = 1 << ip->i_depth; + hsize *= sizeof(__be64); + if (hsize != i_size_read(&ip->i_inode)) { + gfs2_consist_inode(ip); + return ERR_PTR(-EIO); + } + + hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN); + if (hc == NULL) + hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL); + + if (hc == NULL) + return ERR_PTR(-ENOMEM); + + ret = gfs2_dir_read_data(ip, hc, hsize); + if (ret < 0) { + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); + return ERR_PTR(ret); + } + + spin_lock(&inode->i_lock); + if (ip->i_hash_cache) { + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); + } else { + ip->i_hash_cache = hc; + } + spin_unlock(&inode->i_lock); + + return ip->i_hash_cache; +} + +/** + * gfs2_dir_hash_inval - Invalidate dir hash + * @ip: The directory inode + * + * Must be called with an exclusive glock, or during glock invalidation. + */ +void gfs2_dir_hash_inval(struct gfs2_inode *ip) +{ + __be64 *hc = ip->i_hash_cache; + ip->i_hash_cache = NULL; + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); +} + static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent) { return dent->de_inum.no_addr == 0 || dent->de_inum.no_formal_ino == 0; @@ -393,7 +456,7 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent, unsigned totlen = be16_to_cpu(dent->de_rec_len); if (gfs2_dirent_sentinel(dent)) - actual = GFS2_DIRENT_SIZE(0); + actual = 0; if (totlen - actual >= required) return 1; return 0; @@ -446,8 +509,8 @@ static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset, goto error; return 0; error: - printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg, - first ? "first in block" : "not first in block"); + pr_warn("%s: %s (%s)\n", + __func__, msg, first ? "first in block" : "not first in block"); return -EIO; } @@ -470,8 +533,7 @@ static int gfs2_dirent_offset(const void *buf) } return offset; wrong_type: - printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n", - be32_to_cpu(h->mh_type)); + pr_warn("%s: wrong block type %u\n", __func__, be32_to_cpu(h->mh_type)); return -1; } @@ -526,38 +588,6 @@ consist_inode: return ERR_PTR(-EIO); } - -/** - * dirent_first - Return the first dirent - * @dip: the directory - * @bh: The buffer - * @dent: Pointer to list of dirents - * - * return first dirent whether bh points to leaf or stuffed dinode - * - * Returns: IS_LEAF, IS_DINODE, or -errno - */ - -static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh, - struct gfs2_dirent **dent) -{ - struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data; - - if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) { - if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh)) - return -EIO; - *dent = (struct gfs2_dirent *)(bh->b_data + - sizeof(struct gfs2_leaf)); - return IS_LEAF; - } else { - if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI)) - return -EIO; - *dent = (struct gfs2_dirent *)(bh->b_data + - sizeof(struct gfs2_dinode)); - return IS_DINODE; - } -} - static int dirent_check_reclen(struct gfs2_inode *dip, const struct gfs2_dirent *d, const void *end_p) { @@ -630,7 +660,7 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh, return; } - gfs2_trans_add_bh(dip->i_gl, bh, 1); + gfs2_trans_add_meta(dip->i_gl, bh); /* If there is no prev entry, this is the first entry in the block. The de_rec_len is already as big as it needs to be. Just zero @@ -673,7 +703,7 @@ static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode, offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); totlen = be16_to_cpu(dent->de_rec_len); BUG_ON(offset + name->len > totlen); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); ndent = (struct gfs2_dirent *)((char *)dent + offset); dent->de_rec_len = cpu_to_be16(offset); gfs2_qstr2dirent(name, totlen - offset, ndent); @@ -699,7 +729,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp); if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) { - /* printk(KERN_INFO "block num=%llu\n", leaf_no); */ + /* pr_info("block num=%llu\n", leaf_no); */ error = -EIO; } @@ -718,17 +748,12 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, static int get_leaf_nr(struct gfs2_inode *dip, u32 index, u64 *leaf_out) { - __be64 leaf_no; - int error; - - error = gfs2_dir_read_data(dip, (char *)&leaf_no, - index * sizeof(__be64), - sizeof(__be64), 0); - if (error != sizeof(u64)) - return (error < 0) ? error : -EIO; - - *leaf_out = be64_to_cpu(leaf_no); + __be64 *hash; + hash = gfs2_dir_get_hash_table(dip); + if (IS_ERR(hash)) + return PTR_ERR(hash); + *leaf_out = be64_to_cpu(*(hash + index)); return 0; } @@ -755,17 +780,17 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, struct gfs2_inode *ip = GFS2_I(inode); int error; - if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { + if (ip->i_diskflags & GFS2_DIF_EXHASH) { struct gfs2_leaf *leaf; - unsigned hsize = 1 << ip->i_di.di_depth; + unsigned hsize = 1 << ip->i_depth; unsigned index; u64 ln; - if (hsize * sizeof(u64) != ip->i_di.di_size) { + if (hsize * sizeof(u64) != i_size_read(inode)) { gfs2_consist_inode(ip); return ERR_PTR(-EIO); } - index = name->hash >> (32 - ip->i_di.di_depth); + index = name->hash >> (32 - ip->i_depth); error = get_first_leaf(ip, index, &bh); if (error) return ERR_PTR(error); @@ -803,22 +828,35 @@ got_dent: static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth) { struct gfs2_inode *ip = GFS2_I(inode); - u64 bn = gfs2_alloc_meta(ip); - struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn); + unsigned int n = 1; + u64 bn; + int error; + struct buffer_head *bh; struct gfs2_leaf *leaf; struct gfs2_dirent *dent; - struct qstr name = { .name = "", .len = 0, .hash = 0 }; + struct qstr name = { .name = "" }; + struct timespec tv = CURRENT_TIME; + + error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); + if (error) + return NULL; + bh = gfs2_meta_new(ip->i_gl, bn); if (!bh) return NULL; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); leaf = (struct gfs2_leaf *)bh->b_data; leaf->lf_depth = cpu_to_be16(depth); leaf->lf_entries = 0; leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE); leaf->lf_next = 0; - memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved)); + leaf->lf_inode = cpu_to_be64(ip->i_no_addr); + leaf->lf_dist = cpu_to_be32(1); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); + memset(leaf->lf_reserved2, 0, sizeof(leaf->lf_reserved2)); dent = (struct gfs2_dirent *)(leaf+1); gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent); *pbh = bh; @@ -857,8 +895,8 @@ static int dir_make_exhash(struct inode *inode) return -ENOSPC; bn = bh->b_blocknr; - gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16)); - leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries); + gfs2_assert(sdp, dip->i_entries < (1 << 16)); + leaf->lf_entries = cpu_to_be16(dip->i_entries); /* Copy dirents */ @@ -896,7 +934,7 @@ static int dir_make_exhash(struct inode *inode) /* We're done with the new leaf block, now setup the new hash table. */ - gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_trans_add_meta(dip->i_gl, dibh); gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode)); @@ -904,13 +942,12 @@ static int dir_make_exhash(struct inode *inode) for (x = sdp->sd_hash_ptrs; x--; lp++) *lp = cpu_to_be64(bn); - dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2; - dip->i_di.di_blocks++; - gfs2_set_inode_blocks(&dip->i_inode); - dip->i_di.di_flags |= GFS2_DIF_EXHASH; + i_size_write(inode, sdp->sd_sb.sb_bsize / 2); + gfs2_add_inode_blocks(&dip->i_inode, 1); + dip->i_diskflags |= GFS2_DIF_EXHASH; for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; - dip->i_di.di_depth = y; + dip->i_depth = y; gfs2_dinode_out(dip, dibh->b_data); @@ -941,7 +978,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) int x, moved = 0; int error; - index = name->hash >> (32 - dip->i_di.di_depth); + index = name->hash >> (32 - dip->i_depth); error = get_leaf_nr(dip, index, &leaf_no); if (error) return error; @@ -952,12 +989,12 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) return error; oleaf = (struct gfs2_leaf *)obh->b_data; - if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) { + if (dip->i_depth == be16_to_cpu(oleaf->lf_depth)) { brelse(obh); return 1; /* can't split */ } - gfs2_trans_add_bh(dip->i_gl, obh, 1); + gfs2_trans_add_meta(dip->i_gl, obh); nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1); if (!nleaf) { @@ -967,10 +1004,11 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) bn = nbh->b_blocknr; /* Compute the start and len of leaf pointers in the hash table. */ - len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth)); + len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth)); half_len = len >> 1; if (!half_len) { - printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index); + pr_warn("i_depth %u lf_depth %u index %u\n", + dip->i_depth, be16_to_cpu(oleaf->lf_depth), index); gfs2_consist_inode(dip); error = -EIO; goto fail_brelse; @@ -981,11 +1019,18 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) /* Change the pointers. Don't bother distinguishing stuffed from non-stuffed. This code is complicated enough already. */ - lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS | __GFP_NOFAIL); + lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS); + if (!lp) { + error = -ENOMEM; + goto fail_brelse; + } + /* Change the pointers */ for (x = 0; x < half_len; x++) lp[x] = cpu_to_be64(bn); + gfs2_dir_hash_inval(dip); + error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64), half_len * sizeof(u64)); if (error != half_len * sizeof(u64)) { @@ -997,10 +1042,10 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) kfree(lp); /* Compute the divider */ - divider = (start + half_len) << (32 - dip->i_di.di_depth); + divider = (start + half_len) << (32 - dip->i_depth); /* Copy the entries */ - dirent_first(dip, obh, &dent); + dent = (struct gfs2_dirent *)(obh->b_data + sizeof(struct gfs2_leaf)); do { next = dent; @@ -1021,13 +1066,13 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) new->de_inum = dent->de_inum; /* No endian worries */ new->de_type = dent->de_type; /* No endian worries */ - nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1); + be16_add_cpu(&nleaf->lf_entries, 1); dirent_del(dip, obh, prev, dent); if (!oleaf->lf_entries) gfs2_consist_inode(dip); - oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1); + be16_add_cpu(&oleaf->lf_entries, -1); if (!prev) prev = dent; @@ -1043,9 +1088,8 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) error = gfs2_meta_inode_buffer(dip, &dibh); if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { - gfs2_trans_add_bh(dip->i_gl, dibh, 1); - dip->i_di.di_blocks++; - gfs2_set_inode_blocks(&dip->i_inode); + gfs2_trans_add_meta(dip->i_gl, dibh); + gfs2_add_inode_blocks(&dip->i_inode, 1); gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); } @@ -1073,67 +1117,61 @@ fail_brelse: static int dir_double_exhash(struct gfs2_inode *dip) { - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct buffer_head *dibh; u32 hsize; - u64 *buf; - u64 *from, *to; - u64 block; + u32 hsize_bytes; + __be64 *hc; + __be64 *hc2, *h; int x; int error = 0; - hsize = 1 << dip->i_di.di_depth; - if (hsize * sizeof(u64) != dip->i_di.di_size) { - gfs2_consist_inode(dip); - return -EIO; - } + hsize = 1 << dip->i_depth; + hsize_bytes = hsize * sizeof(__be64); - /* Allocate both the "from" and "to" buffers in one big chunk */ + hc = gfs2_dir_get_hash_table(dip); + if (IS_ERR(hc)) + return PTR_ERR(hc); - buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL); + hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN); + if (hc2 == NULL) + hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); - for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) { - error = gfs2_dir_read_data(dip, (char *)buf, - block * sdp->sd_hash_bsize, - sdp->sd_hash_bsize, 1); - if (error != sdp->sd_hash_bsize) { - if (error >= 0) - error = -EIO; - goto fail; - } - - from = buf; - to = (u64 *)((char *)buf + sdp->sd_hash_bsize); + if (!hc2) + return -ENOMEM; - for (x = sdp->sd_hash_ptrs; x--; from++) { - *to++ = *from; /* No endianess worries */ - *to++ = *from; - } + h = hc2; + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + goto out_kfree; - error = gfs2_dir_write_data(dip, - (char *)buf + sdp->sd_hash_bsize, - block * sdp->sd_sb.sb_bsize, - sdp->sd_sb.sb_bsize); - if (error != sdp->sd_sb.sb_bsize) { - if (error >= 0) - error = -EIO; - goto fail; - } + for (x = 0; x < hsize; x++) { + *h++ = *hc; + *h++ = *hc; + hc++; } - kfree(buf); + error = gfs2_dir_write_data(dip, (char *)hc2, 0, hsize_bytes * 2); + if (error != (hsize_bytes * 2)) + goto fail; - error = gfs2_meta_inode_buffer(dip, &dibh); - if (!gfs2_assert_withdraw(sdp, !error)) { - dip->i_di.di_depth++; - gfs2_dinode_out(dip, dibh->b_data); - brelse(dibh); - } - - return error; + gfs2_dir_hash_inval(dip); + dip->i_hash_cache = hc2; + dip->i_depth++; + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); + return 0; fail: - kfree(buf); + /* Replace original hash table & size */ + gfs2_dir_write_data(dip, (char *)hc, 0, hsize_bytes); + i_size_write(&dip->i_inode, hsize_bytes); + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); +out_kfree: + if (is_vmalloc_addr(hc2)) + vfree(hc2); + else + kfree(hc2); return error; } @@ -1182,9 +1220,7 @@ static int compare_dents(const void *a, const void *b) /** * do_filldir_main - read out directory entries * @dip: The GFS2 inode - * @offset: The offset in the file to read from - * @opaque: opaque data to pass to filldir - * @filldir: The function to pass entries to + * @ctx: what to feed the entries to * @darr: an array of struct gfs2_dirent pointers to read * @entries: the number of entries in darr * @copied: pointer to int that's non-zero if a entry has been copied out @@ -1194,11 +1230,10 @@ static int compare_dents(const void *a, const void *b) * the possibility that they will fall into different readdir buffers or * that someone will want to seek to that location. * - * Returns: errno, >0 on exception from filldir + * Returns: errno, >0 if the actor tells you to stop */ -static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, - void *opaque, filldir_t filldir, +static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx, const struct gfs2_dirent **darr, u32 entries, int *copied) { @@ -1206,7 +1241,6 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, u64 off, off_next; unsigned int x, y; int run = 0; - int error = 0; sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); @@ -1223,9 +1257,9 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, off_next = be32_to_cpu(dent_next->de_hash); off_next = gfs2_disk_hash2offset(off_next); - if (off < *offset) + if (off < ctx->pos) continue; - *offset = off; + ctx->pos = off; if (off_next == off) { if (*copied && !run) @@ -1234,32 +1268,50 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, } else run = 0; } else { - if (off < *offset) + if (off < ctx->pos) continue; - *offset = off; + ctx->pos = off; } - error = filldir(opaque, (const char *)(dent + 1), + if (!dir_emit(ctx, (const char *)(dent + 1), be16_to_cpu(dent->de_name_len), - off, be64_to_cpu(dent->de_inum.no_addr), - be16_to_cpu(dent->de_type)); - if (error) + be64_to_cpu(dent->de_inum.no_addr), + be16_to_cpu(dent->de_type))) return 1; *copied = 1; } - /* Increment the *offset by one, so the next time we come into the + /* Increment the ctx->pos by one, so the next time we come into the do_filldir fxn, we get the next entry instead of the last one in the current leaf */ - (*offset)++; + ctx->pos++; return 0; } -static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir, int *copied, unsigned *depth, +static void *gfs2_alloc_sort_buffer(unsigned size) +{ + void *ptr = NULL; + + if (size < KMALLOC_MAX_SIZE) + ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN); + if (!ptr) + ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL); + return ptr; +} + +static void gfs2_free_sort_buffer(void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); +} + +static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, + int *copied, unsigned *depth, u64 leaf_no) { struct gfs2_inode *ip = GFS2_I(inode); @@ -1298,7 +1350,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, * 99 is the maximum number of entries that can fit in a single * leaf block. */ - larr = vmalloc((leaves + entries + 99) * sizeof(void *)); + larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *)); if (!larr) goto out; darr = (const struct gfs2_dirent **)(larr + leaves); @@ -1309,7 +1361,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, do { error = get_leaf(ip, lfn, &bh); if (error) - goto out_kfree; + goto out_free; lf = (struct gfs2_leaf *)bh->b_data; lfn = be64_to_cpu(lf->lf_next); if (lf->lf_entries) { @@ -1318,7 +1370,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, gfs2_dirent_gather, NULL, &g); error = PTR_ERR(dent); if (IS_ERR(dent)) - goto out_kfree; + goto out_free; if (entries2 != g.offset) { fs_warn(sdp, "Number of entries corrupt in dir " "leaf %llu, entries2 (%u) != " @@ -1327,7 +1379,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, entries2, g.offset); error = -EIO; - goto out_kfree; + goto out_free; } error = 0; larr[leaf++] = bh; @@ -1337,87 +1389,111 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, } while(lfn); BUG_ON(entries2 != entries); - error = do_filldir_main(ip, offset, opaque, filldir, darr, - entries, copied); -out_kfree: + error = do_filldir_main(ip, ctx, darr, entries, copied); +out_free: for(i = 0; i < leaf; i++) brelse(larr[i]); - vfree(larr); + gfs2_free_sort_buffer(larr); out: return error; } /** + * gfs2_dir_readahead - Issue read-ahead requests for leaf blocks. + * + * Note: we can't calculate each index like dir_e_read can because we don't + * have the leaf, and therefore we don't have the depth, and therefore we + * don't have the length. So we have to just read enough ahead to make up + * for the loss of information. + */ +static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, + struct file_ra_state *f_ra) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_glock *gl = ip->i_gl; + struct buffer_head *bh; + u64 blocknr = 0, last; + unsigned count; + + /* First check if we've already read-ahead for the whole range. */ + if (index + MAX_RA_BLOCKS < f_ra->start) + return; + + f_ra->start = max((pgoff_t)index, f_ra->start); + for (count = 0; count < MAX_RA_BLOCKS; count++) { + if (f_ra->start >= hsize) /* if exceeded the hash table */ + break; + + last = blocknr; + blocknr = be64_to_cpu(ip->i_hash_cache[f_ra->start]); + f_ra->start++; + if (blocknr == last) + continue; + + bh = gfs2_getbuf(gl, blocknr, 1); + if (trylock_buffer(bh)) { + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + brelse(bh); + continue; + } + bh->b_end_io = end_buffer_read_sync; + submit_bh(READA | REQ_META, bh); + continue; + } + brelse(bh); + } +} + +/** * dir_e_read - Reads the entries from a directory into a filldir buffer * @dip: dinode pointer - * @offset: the hash of the last entry read shifted to the right once - * @opaque: buffer for the filldir function to fill - * @filldir: points to the filldir function to use + * @ctx: actor to feed the entries to * * Returns: errno */ -static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir) +static int dir_e_read(struct inode *inode, struct dir_context *ctx, + struct file_ra_state *f_ra) { struct gfs2_inode *dip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); u32 hsize, len = 0; - u32 ht_offset, lp_offset, ht_offset_cur = -1; u32 hash, index; __be64 *lp; int copied = 0; int error = 0; unsigned depth = 0; - hsize = 1 << dip->i_di.di_depth; - if (hsize * sizeof(u64) != dip->i_di.di_size) { - gfs2_consist_inode(dip); - return -EIO; - } + hsize = 1 << dip->i_depth; + hash = gfs2_dir_offset2hash(ctx->pos); + index = hash >> (32 - dip->i_depth); - hash = gfs2_dir_offset2hash(*offset); - index = hash >> (32 - dip->i_di.di_depth); + if (dip->i_hash_cache == NULL) + f_ra->start = 0; + lp = gfs2_dir_get_hash_table(dip); + if (IS_ERR(lp)) + return PTR_ERR(lp); - lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL); - if (!lp) - return -ENOMEM; + gfs2_dir_readahead(inode, hsize, index, f_ra); while (index < hsize) { - lp_offset = index & (sdp->sd_hash_ptrs - 1); - ht_offset = index - lp_offset; - - if (ht_offset_cur != ht_offset) { - error = gfs2_dir_read_data(dip, (char *)lp, - ht_offset * sizeof(__be64), - sdp->sd_hash_bsize, 1); - if (error != sdp->sd_hash_bsize) { - if (error >= 0) - error = -EIO; - goto out; - } - ht_offset_cur = ht_offset; - } - - error = gfs2_dir_read_leaf(inode, offset, opaque, filldir, + error = gfs2_dir_read_leaf(inode, ctx, &copied, &depth, - be64_to_cpu(lp[lp_offset])); + be64_to_cpu(lp[index])); if (error) break; - len = 1 << (dip->i_di.di_depth - depth); + len = 1 << (dip->i_depth - depth); index = (index & ~(len - 1)) + len; } -out: - kfree(lp); if (error > 0) error = 0; return error; } -int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir) +int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, + struct file_ra_state *f_ra) { struct gfs2_inode *dip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -1427,11 +1503,11 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, int copied = 0; int error; - if (!dip->i_di.di_entries) + if (!dip->i_entries) return 0; - if (dip->i_di.di_flags & GFS2_DIF_EXHASH) - return dir_e_read(inode, offset, opaque, filldir); + if (dip->i_diskflags & GFS2_DIF_EXHASH) + return dir_e_read(inode, ctx, f_ra); if (!gfs2_is_stuffed(dip)) { gfs2_consist_inode(dip); @@ -1444,7 +1520,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, error = -ENOMEM; /* 96 is max number of dirents which can be stuffed into an inode */ - darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_KERNEL); + darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS); if (darr) { g.pdent = darr; g.offset = 0; @@ -1454,17 +1530,17 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, error = PTR_ERR(dent); goto out; } - if (dip->i_di.di_entries != g.offset) { + if (dip->i_entries != g.offset) { fs_warn(sdp, "Number of entries corrupt in dir %llu, " - "ip->i_di.di_entries (%u) != g.offset (%u)\n", + "ip->i_entries (%u) != g.offset (%u)\n", (unsigned long long)dip->i_no_addr, - dip->i_di.di_entries, + dip->i_entries, g.offset); error = -EIO; goto out; } - error = do_filldir_main(dip, offset, opaque, filldir, darr, - dip->i_di.di_entries, &copied); + error = do_filldir_main(dip, ctx, darr, + dip->i_entries, &copied); out: kfree(darr); } @@ -1479,9 +1555,9 @@ out: /** * gfs2_dir_search - Search a directory - * @dip: The GFS2 inode - * @filename: - * @inode: + * @dip: The GFS2 dir inode + * @name: The name we are looking up + * @fail_on_exist: Fail if the name exists rather than looking it up * * This routine searches a directory for a file or another directory. * Assumes a glock is held on dip. @@ -1489,22 +1565,25 @@ out: * Returns: errno */ -struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name) +struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name, + bool fail_on_exist) { struct buffer_head *bh; struct gfs2_dirent *dent; - struct inode *inode; + u64 addr, formal_ino; + u16 dtype; dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); if (dent) { if (IS_ERR(dent)) return ERR_CAST(dent); - inode = gfs2_inode_lookup(dir->i_sb, - be16_to_cpu(dent->de_type), - be64_to_cpu(dent->de_inum.no_addr), - be64_to_cpu(dent->de_inum.no_formal_ino), 0); + dtype = be16_to_cpu(dent->de_type); + addr = be64_to_cpu(dent->de_inum.no_addr); + formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino); brelse(bh); - return inode; + if (fail_on_exist) + return ERR_PTR(-EEXIST); + return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0); } return ERR_PTR(-ENOENT); } @@ -1540,20 +1619,41 @@ out: return ret; } +/** + * dir_new_leaf - Add a new leaf onto hash chain + * @inode: The directory + * @name: The name we are adding + * + * This adds a new dir leaf onto an existing leaf when there is not + * enough space to add a new dir entry. This is a last resort after + * we've expanded the hash table to max size and also split existing + * leaf blocks, so it will only occur for very large directories. + * + * The dist parameter is set to 1 for leaf blocks directly attached + * to the hash table, 2 for one layer of indirection, 3 for two layers + * etc. We are thus able to tell the difference between an old leaf + * with dist set to zero (i.e. "don't know") and a new one where we + * set this information for debug/fsck purposes. + * + * Returns: 0 on success, or -ve on error + */ + static int dir_new_leaf(struct inode *inode, const struct qstr *name) { struct buffer_head *bh, *obh; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_leaf *leaf, *oleaf; + u32 dist = 1; int error; u32 index; u64 bn; - index = name->hash >> (32 - ip->i_di.di_depth); + index = name->hash >> (32 - ip->i_depth); error = get_first_leaf(ip, index, &obh); if (error) return error; do { + dist++; oleaf = (struct gfs2_leaf *)obh->b_data; bn = be64_to_cpu(oleaf->lf_next); if (!bn) @@ -1564,13 +1664,14 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) return error; } while(1); - gfs2_trans_add_bh(ip->i_gl, obh, 1); + gfs2_trans_add_meta(ip->i_gl, obh); leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth)); if (!leaf) { brelse(obh); return -ENOSPC; } + leaf->lf_dist = cpu_to_be32(dist); oleaf->lf_next = cpu_to_be64(bh->b_blocknr); brelse(bh); brelse(obh); @@ -1578,59 +1679,78 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) error = gfs2_meta_inode_buffer(ip, &bh); if (error) return error; - gfs2_trans_add_bh(ip->i_gl, bh, 1); - ip->i_di.di_blocks++; - gfs2_set_inode_blocks(&ip->i_inode); + gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_add_inode_blocks(&ip->i_inode, 1); gfs2_dinode_out(ip, bh->b_data); brelse(bh); return 0; } +static u16 gfs2_inode_ra_len(const struct gfs2_inode *ip) +{ + u64 where = ip->i_no_addr + 1; + if (ip->i_eattr == where) + return 1; + return 0; +} + /** * gfs2_dir_add - Add new filename into directory - * @dip: The GFS2 inode - * @filename: The new name - * @inode: The inode number of the entry - * @type: The type of the entry + * @inode: The directory inode + * @name: The new name + * @nip: The GFS2 inode to be linked in to the directory + * @da: The directory addition info + * + * If the call to gfs2_diradd_alloc_required resulted in there being + * no need to allocate any new directory blocks, then it will contain + * a pointer to the directory entry and the bh in which it resides. We + * can use that without having to repeat the search. If there was no + * free space, then we must now create more space. * * Returns: 0 on success, error code on failure */ int gfs2_dir_add(struct inode *inode, const struct qstr *name, - const struct gfs2_inode *nip, unsigned type) + const struct gfs2_inode *nip, struct gfs2_diradd *da) { struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *bh; - struct gfs2_dirent *dent; + struct buffer_head *bh = da->bh; + struct gfs2_dirent *dent = da->dent; + struct timespec tv; struct gfs2_leaf *leaf; int error; while(1) { - dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, - &bh); + if (da->bh == NULL) { + dent = gfs2_dirent_search(inode, name, + gfs2_dirent_find_space, &bh); + } if (dent) { if (IS_ERR(dent)) return PTR_ERR(dent); dent = gfs2_init_dirent(inode, dent, name, bh); gfs2_inum_out(nip, dent); - dent->de_type = cpu_to_be16(type); - if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { + dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); + dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip)); + tv = CURRENT_TIME; + if (ip->i_diskflags & GFS2_DIF_EXHASH) { leaf = (struct gfs2_leaf *)bh->b_data; - leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1); + be16_add_cpu(&leaf->lf_entries, 1); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); } + da->dent = NULL; + da->bh = NULL; brelse(bh); - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - break; - gfs2_trans_add_bh(ip->i_gl, bh, 1); - ip->i_di.di_entries++; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_dinode_out(ip, bh->b_data); - brelse(bh); + ip->i_entries++; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv; + if (S_ISDIR(nip->i_inode.i_mode)) + inc_nlink(&ip->i_inode); + mark_inode_dirty(inode); error = 0; break; } - if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) { + if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) { error = dir_make_exhash(inode); if (error) break; @@ -1641,7 +1761,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, continue; if (error < 0) break; - if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) { + if (ip->i_depth < GFS2_DIR_MAX_DEPTH) { error = dir_double_exhash(ip); if (error) break; @@ -1669,11 +1789,12 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, * Returns: 0 on success, error code on failure */ -int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name) +int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) { + const struct qstr *name = &dentry->d_name; struct gfs2_dirent *dent, *prev = NULL; struct buffer_head *bh; - int error; + struct timespec tv = CURRENT_TIME; /* Returns _either_ the entry (if its first in block) or the previous entry otherwise */ @@ -1693,29 +1814,26 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name) } dirent_del(dip, bh, prev, dent); - if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { + if (dip->i_diskflags & GFS2_DIF_EXHASH) { struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; u16 entries = be16_to_cpu(leaf->lf_entries); if (!entries) gfs2_consist_inode(dip); leaf->lf_entries = cpu_to_be16(--entries); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); } brelse(bh); - error = gfs2_meta_inode_buffer(dip, &bh); - if (error) - return error; - - if (!dip->i_di.di_entries) + if (!dip->i_entries) gfs2_consist_inode(dip); - gfs2_trans_add_bh(dip->i_gl, bh, 1); - dip->i_di.di_entries--; - dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; - gfs2_dinode_out(dip, bh->b_data); - brelse(bh); + dip->i_entries--; + dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv; + if (S_ISDIR(dentry->d_inode->i_mode)) + drop_nlink(&dip->i_inode); mark_inode_dirty(&dip->i_inode); - return error; + return 0; } /** @@ -1746,16 +1864,16 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, if (IS_ERR(dent)) return PTR_ERR(dent); - gfs2_trans_add_bh(dip->i_gl, bh, 1); + gfs2_trans_add_meta(dip->i_gl, bh); gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(new_type); - if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { + if (dip->i_diskflags & GFS2_DIF_EXHASH) { brelse(bh); error = gfs2_meta_inode_buffer(dip, &bh); if (error) return error; - gfs2_trans_add_bh(dip->i_gl, bh, 1); + gfs2_trans_add_meta(dip->i_gl, bh); } dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; @@ -1765,94 +1883,20 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, } /** - * foreach_leaf - call a function for each leaf in a directory - * @dip: the directory - * @lc: the function to call for each each - * @data: private data to pass to it - * - * Returns: errno - */ - -static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data) -{ - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct buffer_head *bh; - struct gfs2_leaf *leaf; - u32 hsize, len; - u32 ht_offset, lp_offset, ht_offset_cur = -1; - u32 index = 0; - __be64 *lp; - u64 leaf_no; - int error = 0; - - hsize = 1 << dip->i_di.di_depth; - if (hsize * sizeof(u64) != dip->i_di.di_size) { - gfs2_consist_inode(dip); - return -EIO; - } - - lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL); - if (!lp) - return -ENOMEM; - - while (index < hsize) { - lp_offset = index & (sdp->sd_hash_ptrs - 1); - ht_offset = index - lp_offset; - - if (ht_offset_cur != ht_offset) { - error = gfs2_dir_read_data(dip, (char *)lp, - ht_offset * sizeof(__be64), - sdp->sd_hash_bsize, 1); - if (error != sdp->sd_hash_bsize) { - if (error >= 0) - error = -EIO; - goto out; - } - ht_offset_cur = ht_offset; - } - - leaf_no = be64_to_cpu(lp[lp_offset]); - if (leaf_no) { - error = get_leaf(dip, leaf_no, &bh); - if (error) - goto out; - leaf = (struct gfs2_leaf *)bh->b_data; - len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth)); - brelse(bh); - - error = lc(dip, index, len, leaf_no, data); - if (error) - goto out; - - index = (index & ~(len - 1)) + len; - } else - index++; - } - - if (index != hsize) { - gfs2_consist_inode(dip); - error = -EIO; - } - -out: - kfree(lp); - - return error; -} - -/** * leaf_dealloc - Deallocate a directory leaf * @dip: the directory * @index: the hash table offset in the directory * @len: the number of pointers to this leaf * @leaf_no: the leaf number - * @data: not used + * @leaf_bh: buffer_head for the starting leaf + * last_dealloc: 1 if this is the final dealloc for the leaf, else 0 * * Returns: errno */ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, - u64 leaf_no, void *data) + u64 leaf_no, struct buffer_head *leaf_bh, + int last_dealloc) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_leaf *tmp_leaf; @@ -1864,37 +1908,41 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, unsigned int x, size = len * sizeof(u64); int error; + error = gfs2_rindex_update(sdp); + if (error) + return error; + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); - ht = kzalloc(size, GFP_KERNEL); + ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN); + if (ht == NULL) + ht = vzalloc(size); if (!ht) return -ENOMEM; - gfs2_alloc_get(dip); - - error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_quota_hold(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) goto out; - error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh); - if (error) - goto out_qs; - /* Count the number of leaves */ + bh = leaf_bh; for (blk = leaf_no; blk; blk = nblk) { - error = get_leaf(dip, blk, &bh); - if (error) - goto out_rlist; + if (blk != leaf_no) { + error = get_leaf(dip, blk, &bh); + if (error) + goto out_rlist; + } tmp_leaf = (struct gfs2_leaf *)bh->b_data; nblk = be64_to_cpu(tmp_leaf->lf_next); - brelse(bh); + if (blk != leaf_no) + brelse(bh); - gfs2_rlist_add(sdp, &rlist, blk); + gfs2_rlist_add(dip, &rlist, blk); l_blocks++; } - gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); for (x = 0; x < rlist.rl_rgrps; x++) { struct gfs2_rgrpd *rgd; @@ -1912,20 +1960,21 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (error) goto out_rg_gunlock; + bh = leaf_bh; + for (blk = leaf_no; blk; blk = nblk) { - error = get_leaf(dip, blk, &bh); - if (error) - goto out_end_trans; + if (blk != leaf_no) { + error = get_leaf(dip, blk, &bh); + if (error) + goto out_end_trans; + } tmp_leaf = (struct gfs2_leaf *)bh->b_data; nblk = be64_to_cpu(tmp_leaf->lf_next); - brelse(bh); + if (blk != leaf_no) + brelse(bh); gfs2_free_meta(dip, blk, 1); - - if (!dip->i_di.di_blocks) - gfs2_consist_inode(dip); - dip->i_di.di_blocks--; - gfs2_set_inode_blocks(&dip->i_inode); + gfs2_add_inode_blocks(&dip->i_inode, -1); } error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size); @@ -1939,7 +1988,11 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (error) goto out_end_trans; - gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_trans_add_meta(dip->i_gl, dibh); + /* On the last dealloc, make this a regular file in case we crash. + (We don't want to free these blocks a second time.) */ + if (last_dealloc) + dip->i_inode.i_mode = S_IFREG; gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); @@ -1949,12 +2002,12 @@ out_rg_gunlock: gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); out_rlist: gfs2_rlist_free(&rlist); - gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh); -out_qs: gfs2_quota_unhold(dip); out: - gfs2_alloc_put(dip); - kfree(ht); + if (is_vmalloc_addr(ht)) + vfree(ht); + else + kfree(ht); return error; } @@ -1970,31 +2023,47 @@ out: int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) { - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct buffer_head *bh; - int error; + struct gfs2_leaf *leaf; + u32 hsize, len; + u32 index = 0, next_index; + __be64 *lp; + u64 leaf_no; + int error = 0, last; - /* Dealloc on-disk leaves to FREEMETA state */ - error = foreach_leaf(dip, leaf_dealloc, NULL); - if (error) - return error; + hsize = 1 << dip->i_depth; - /* Make this a regular file in case we crash. - (We don't want to free these blocks a second time.) */ + lp = gfs2_dir_get_hash_table(dip); + if (IS_ERR(lp)) + return PTR_ERR(lp); - error = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (error) - return error; + while (index < hsize) { + leaf_no = be64_to_cpu(lp[index]); + if (leaf_no) { + error = get_leaf(dip, leaf_no, &bh); + if (error) + goto out; + leaf = (struct gfs2_leaf *)bh->b_data; + len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth)); - error = gfs2_meta_inode_buffer(dip, &bh); - if (!error) { - gfs2_trans_add_bh(dip->i_gl, bh, 1); - ((struct gfs2_dinode *)bh->b_data)->di_mode = - cpu_to_be32(S_IFREG); - brelse(bh); + next_index = (index & ~(len - 1)) + len; + last = ((next_index >= hsize) ? 1 : 0); + error = leaf_dealloc(dip, index, len, leaf_no, bh, + last); + brelse(bh); + if (error) + goto out; + index = next_index; + } else + index++; } - gfs2_trans_end(sdp); + if (index != hsize) { + gfs2_consist_inode(dip); + error = -EIO; + } + +out: return error; } @@ -2003,22 +2072,36 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) * gfs2_diradd_alloc_required - find if adding entry will require an allocation * @ip: the file being written to * @filname: the filename that's going to be added + * @da: The structure to return dir alloc info * - * Returns: 1 if alloc required, 0 if not, -ve on error + * Returns: 0 if ok, -ve on error */ -int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name) +int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name, + struct gfs2_diradd *da) { + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + const unsigned int extra = sizeof(struct gfs2_dinode) - sizeof(struct gfs2_leaf); struct gfs2_dirent *dent; struct buffer_head *bh; + da->nr_blocks = 0; + da->bh = NULL; + da->dent = NULL; + dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); if (!dent) { - return 1; + da->nr_blocks = sdp->sd_max_dirres; + if (!(ip->i_diskflags & GFS2_DIF_EXHASH) && + (GFS2_DIRENT_SIZE(name->len) < extra)) + da->nr_blocks = 1; + return 0; } if (IS_ERR(dent)) return PTR_ERR(dent); - brelse(bh); + da->bh = bh; + da->dent = dent; return 0; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 8a468cac932..126c65dda02 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -11,28 +11,47 @@ #define __DIR_DOT_H__ #include <linux/dcache.h> +#include <linux/crc32.h> struct inode; struct gfs2_inode; struct gfs2_inum; +struct buffer_head; +struct gfs2_dirent; -struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename); -int gfs2_dir_check(struct inode *dir, const struct qstr *filename, - const struct gfs2_inode *ip); -int gfs2_dir_add(struct inode *inode, const struct qstr *filename, - const struct gfs2_inode *ip, unsigned int type); -int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); -int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir); -int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, - const struct gfs2_inode *nip, unsigned int new_type); +struct gfs2_diradd { + unsigned nr_blocks; + struct gfs2_dirent *dent; + struct buffer_head *bh; +}; -int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); +extern struct inode *gfs2_dir_search(struct inode *dir, + const struct qstr *filename, + bool fail_on_exist); +extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, + const struct gfs2_inode *ip); +extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, + const struct gfs2_inode *ip, struct gfs2_diradd *da); +static inline void gfs2_dir_no_add(struct gfs2_diradd *da) +{ + if (da->bh) + brelse(da->bh); + da->bh = NULL; +} +extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); +extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, + struct file_ra_state *f_ra); +extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, + const struct gfs2_inode *nip, unsigned int new_type); -int gfs2_diradd_alloc_required(struct inode *dir, - const struct qstr *filename); -int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, - struct buffer_head **bhp); +extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); + +extern int gfs2_diradd_alloc_required(struct inode *dir, + const struct qstr *filename, + struct gfs2_diradd *da); +extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp); +extern void gfs2_dir_hash_inval(struct gfs2_inode *ip); static inline u32 gfs2_disk_hash(const char *data, int len) { @@ -60,4 +79,7 @@ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct memcpy(dent + 1, name->name, name->len); } +extern struct qstr gfs2_qdot; +extern struct qstr gfs2_qdotdot; + #endif /* __DIR_DOT_H__ */ diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c deleted file mode 100644 index f114ba2b355..00000000000 --- a/fs/gfs2/eaops.c +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/completion.h> -#include <linux/buffer_head.h> -#include <linux/capability.h> -#include <linux/xattr.h> -#include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> -#include <asm/uaccess.h> - -#include "gfs2.h" -#include "incore.h" -#include "acl.h" -#include "eaops.h" -#include "eattr.h" -#include "util.h" - -/** - * gfs2_ea_name2type - get the type of the ea, and truncate type from the name - * @namep: ea name, possibly with type appended - * - * Returns: GFS2_EATYPE_XXX - */ - -unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name) -{ - unsigned int type; - - if (strncmp(name, "system.", 7) == 0) { - type = GFS2_EATYPE_SYS; - if (truncated_name) - *truncated_name = name + sizeof("system.") - 1; - } else if (strncmp(name, "user.", 5) == 0) { - type = GFS2_EATYPE_USR; - if (truncated_name) - *truncated_name = name + sizeof("user.") - 1; - } else if (strncmp(name, "security.", 9) == 0) { - type = GFS2_EATYPE_SECURITY; - if (truncated_name) - *truncated_name = name + sizeof("security.") - 1; - } else { - type = GFS2_EATYPE_UNUSED; - if (truncated_name) - *truncated_name = NULL; - } - - return type; -} - -static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) && - !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 && - (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) || - GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len))) - return -EOPNOTSUPP; - - return gfs2_ea_get_i(ip, er); -} - -static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - int remove = 0; - int error; - - if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) { - if (!(er->er_flags & GFS2_ERF_MODE)) { - er->er_mode = ip->i_inode.i_mode; - er->er_flags |= GFS2_ERF_MODE; - } - error = gfs2_acl_validate_set(ip, 1, er, - &remove, &er->er_mode); - if (error) - return error; - error = gfs2_ea_set_i(ip, er); - if (error) - return error; - if (remove) - gfs2_ea_remove_i(ip, er); - return 0; - - } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) { - error = gfs2_acl_validate_set(ip, 0, er, - &remove, NULL); - if (error) - return error; - if (!remove) - error = gfs2_ea_set_i(ip, er); - else { - error = gfs2_ea_remove_i(ip, er); - if (error == -ENODATA) - error = 0; - } - return error; - } - - return -EPERM; -} - -static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) { - int error = gfs2_acl_validate_remove(ip, 1); - if (error) - return error; - - } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) { - int error = gfs2_acl_validate_remove(ip, 0); - if (error) - return error; - - } else - return -EPERM; - - return gfs2_ea_remove_i(ip, er); -} - -static const struct gfs2_eattr_operations gfs2_user_eaops = { - .eo_get = gfs2_ea_get_i, - .eo_set = gfs2_ea_set_i, - .eo_remove = gfs2_ea_remove_i, - .eo_name = "user", -}; - -const struct gfs2_eattr_operations gfs2_system_eaops = { - .eo_get = system_eo_get, - .eo_set = system_eo_set, - .eo_remove = system_eo_remove, - .eo_name = "system", -}; - -static const struct gfs2_eattr_operations gfs2_security_eaops = { - .eo_get = gfs2_ea_get_i, - .eo_set = gfs2_ea_set_i, - .eo_remove = gfs2_ea_remove_i, - .eo_name = "security", -}; - -const struct gfs2_eattr_operations *gfs2_ea_ops[] = { - NULL, - &gfs2_user_eaops, - &gfs2_system_eaops, - &gfs2_security_eaops, -}; - diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h deleted file mode 100644 index da2f7fbbb40..00000000000 --- a/fs/gfs2/eaops.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __EAOPS_DOT_H__ -#define __EAOPS_DOT_H__ - -struct gfs2_ea_request; -struct gfs2_inode; - -struct gfs2_eattr_operations { - int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er); - int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er); - int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er); - char *eo_name; -}; - -unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name); - -extern const struct gfs2_eattr_operations gfs2_system_eaops; - -extern const struct gfs2_eattr_operations *gfs2_ea_ops[]; - -#endif /* __EAOPS_DOT_H__ */ - diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/export.c index 334c7f85351..8b9b3775e2e 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/export.c @@ -7,14 +7,12 @@ * of the GNU General Public License version 2. */ -#include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> -#include <linux/lm_interface.h> #include "gfs2.h" #include "incore.h" @@ -22,8 +20,7 @@ #include "glock.h" #include "glops.h" #include "inode.h" -#include "ops_dentry.h" -#include "ops_fstype.h" +#include "super.h" #include "rgrp.h" #include "util.h" @@ -31,17 +28,20 @@ #define GFS2_LARGE_FH_SIZE 8 #define GFS2_OLD_FH_SIZE 10 -static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, - int connectable) +static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len, + struct inode *parent) { __be32 *fh = (__force __be32 *)p; - struct inode *inode = dentry->d_inode; struct super_block *sb = inode->i_sb; struct gfs2_inode *ip = GFS2_I(inode); - if (*len < GFS2_SMALL_FH_SIZE || - (connectable && *len < GFS2_LARGE_FH_SIZE)) - return 255; + if (parent && (*len < GFS2_LARGE_FH_SIZE)) { + *len = GFS2_LARGE_FH_SIZE; + return FILEID_INVALID; + } else if (*len < GFS2_SMALL_FH_SIZE) { + *len = GFS2_SMALL_FH_SIZE; + return FILEID_INVALID; + } fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); @@ -49,14 +49,10 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); *len = GFS2_SMALL_FH_SIZE; - if (!connectable || inode == sb->s_root->d_inode) + if (!parent || inode == sb->s_root->d_inode) return *len; - spin_lock(&dentry->d_lock); - inode = dentry->d_parent->d_inode; - ip = GFS2_I(inode); - igrab(inode); - spin_unlock(&dentry->d_lock); + ip = GFS2_I(parent); fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32); fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); @@ -64,12 +60,11 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); *len = GFS2_LARGE_FH_SIZE; - iput(inode); - return *len; } struct get_name_filldir { + struct dir_context ctx; struct gfs2_inum_host inum; char *name; }; @@ -94,10 +89,13 @@ static int gfs2_get_name(struct dentry *parent, char *name, struct inode *dir = parent->d_inode; struct inode *inode = child->d_inode; struct gfs2_inode *dip, *ip; - struct get_name_filldir gnfd; + struct get_name_filldir gnfd = { + .ctx.actor = get_name_filldir, + .name = name + }; struct gfs2_holder gh; - u64 offset = 0; int error; + struct file_ra_state f_ra = { .start = 0 }; if (!dir) return -EINVAL; @@ -111,13 +109,12 @@ static int gfs2_get_name(struct dentry *parent, char *name, *name = 0; gnfd.inum.no_addr = ip->i_no_addr; gnfd.inum.no_formal_ino = ip->i_no_formal_ino; - gnfd.name = name; error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); if (error) return error; - error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir); + error = gfs2_dir_read(dir, &gnfd.ctx, &f_ra); gfs2_glock_dq_uninit(&gh); @@ -129,45 +126,16 @@ static int gfs2_get_name(struct dentry *parent, char *name, static struct dentry *gfs2_get_parent(struct dentry *child) { - struct qstr dotdot; - struct inode *inode; - struct dentry *dentry; - - gfs2_str2qstr(&dotdot, ".."); - inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL); - - if (!inode) - return ERR_PTR(-ENOENT); - /* - * In case of an error, @inode carries the error value, and we - * have to return that as a(n invalid) pointer to dentry. - */ - if (IS_ERR(inode)) - return ERR_CAST(inode); - - dentry = d_alloc_anon(inode); - if (!dentry) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - - dentry->d_op = &gfs2_dops; - return dentry; + return d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1)); } static struct dentry *gfs2_get_dentry(struct super_block *sb, - struct gfs2_inum_host *inum) + struct gfs2_inum_host *inum) { struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_holder i_gh, ri_gh, rgd_gh; - struct gfs2_rgrpd *rgd; struct inode *inode; - struct dentry *dentry; - int error; - /* System files? */ - - inode = gfs2_ilookup(sb, inum->no_addr); + inode = gfs2_ilookup(sb, inum->no_addr, 0); if (inode) { if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { iput(inode); @@ -176,83 +144,13 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, goto out_inode; } - error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, - LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - if (error) - return ERR_PTR(error); - - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto fail; - - error = -EINVAL; - rgd = gfs2_blk2rgrpd(sdp, inum->no_addr); - if (!rgd) - goto fail_rindex; - - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); - if (error) - goto fail_rindex; - - error = -ESTALE; - if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE) - goto fail_rgd; - - gfs2_glock_dq_uninit(&rgd_gh); - gfs2_glock_dq_uninit(&ri_gh); - - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, - inum->no_addr, - 0, 0); - if (!inode) - goto fail; - if (IS_ERR(inode)) { - error = PTR_ERR(inode); - goto fail; - } - - error = gfs2_inode_refresh(GFS2_I(inode)); - if (error) { - iput(inode); - goto fail; - } - - /* Pick up the works we bypass in gfs2_inode_lookup */ - if (inode->i_state & I_NEW) - gfs2_set_iop(inode); - - if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { - iput(inode); - goto fail; - } - - error = -EIO; - if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) { - iput(inode); - goto fail; - } - - gfs2_glock_dq_uninit(&i_gh); + inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino, + GFS2_BLKST_DINODE); + if (IS_ERR(inode)) + return ERR_CAST(inode); out_inode: - dentry = d_alloc_anon(inode); - if (!dentry) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - - dentry->d_op = &gfs2_dops; - return dentry; - -fail_rgd: - gfs2_glock_dq_uninit(&rgd_gh); - -fail_rindex: - gfs2_glock_dq_uninit(&ri_gh); - -fail: - gfs2_glock_dq_uninit(&i_gh); - return ERR_PTR(error); + return d_obtain_alias(inode); } static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, @@ -265,6 +163,8 @@ static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, case GFS2_SMALL_FH_SIZE: case GFS2_LARGE_FH_SIZE: case GFS2_OLD_FH_SIZE: + if (fh_len < GFS2_SMALL_FH_SIZE) + return NULL; this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32; this.no_formal_ino |= be32_to_cpu(fh[1]); this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32; @@ -284,6 +184,8 @@ static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid, switch (fh_type) { case GFS2_LARGE_FH_SIZE: case GFS2_OLD_FH_SIZE: + if (fh_len < GFS2_LARGE_FH_SIZE) + return NULL; parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32; parent.no_formal_ino |= be32_to_cpu(fh[5]); parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c new file mode 100644 index 00000000000..26b3f952e6b --- /dev/null +++ b/fs/gfs2/file.c @@ -0,0 +1,1114 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/pagemap.h> +#include <linux/uio.h> +#include <linux/blkdev.h> +#include <linux/mm.h> +#include <linux/mount.h> +#include <linux/fs.h> +#include <linux/gfs2_ondisk.h> +#include <linux/falloc.h> +#include <linux/swap.h> +#include <linux/crc32.h> +#include <linux/writeback.h> +#include <asm/uaccess.h> +#include <linux/dlm.h> +#include <linux/dlm_plock.h> +#include <linux/aio.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +/** + * gfs2_llseek - seek to a location in a file + * @file: the file + * @offset: the offset + * @whence: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) + * + * SEEK_END requires the glock for the file because it references the + * file's size. + * + * Returns: The new offset, or errno + */ + +static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_holder i_gh; + loff_t error; + + switch (whence) { + case SEEK_END: /* These reference inode->i_size */ + case SEEK_DATA: + case SEEK_HOLE: + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (!error) { + error = generic_file_llseek(file, offset, whence); + gfs2_glock_dq_uninit(&i_gh); + } + break; + case SEEK_CUR: + case SEEK_SET: + error = generic_file_llseek(file, offset, whence); + break; + default: + error = -EINVAL; + } + + return error; +} + +/** + * gfs2_readdir - Iterator for a directory + * @file: The directory to read from + * @ctx: What to feed directory entries to + * + * Returns: errno + */ + +static int gfs2_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *dir = file->f_mapping->host; + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_holder d_gh; + int error; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + return error; + + error = gfs2_dir_read(dir, ctx, &file->f_ra); + + gfs2_glock_dq_uninit(&d_gh); + + return error; +} + +/** + * fsflags_cvt + * @table: A table of 32 u32 flags + * @val: a 32 bit value to convert + * + * This function can be used to convert between fsflags values and + * GFS2's own flags values. + * + * Returns: the converted flags + */ +static u32 fsflags_cvt(const u32 *table, u32 val) +{ + u32 res = 0; + while(val) { + if (val & 1) + res |= *table; + table++; + val >>= 1; + } + return res; +} + +static const u32 fsflags_to_gfs2[32] = { + [3] = GFS2_DIF_SYNC, + [4] = GFS2_DIF_IMMUTABLE, + [5] = GFS2_DIF_APPENDONLY, + [7] = GFS2_DIF_NOATIME, + [12] = GFS2_DIF_EXHASH, + [14] = GFS2_DIF_INHERIT_JDATA, + [17] = GFS2_DIF_TOPDIR, +}; + +static const u32 gfs2_to_fsflags[32] = { + [gfs2fl_Sync] = FS_SYNC_FL, + [gfs2fl_Immutable] = FS_IMMUTABLE_FL, + [gfs2fl_AppendOnly] = FS_APPEND_FL, + [gfs2fl_NoAtime] = FS_NOATIME_FL, + [gfs2fl_ExHash] = FS_INDEX_FL, + [gfs2fl_TopLevel] = FS_TOPDIR_FL, + [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, +}; + +static int gfs2_get_flags(struct file *filp, u32 __user *ptr) +{ + struct inode *inode = file_inode(filp); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + u32 fsflags; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + error = gfs2_glock_nq(&gh); + if (error) + return error; + + fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags); + if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA) + fsflags |= FS_JOURNAL_DATA_FL; + if (put_user(fsflags, ptr)) + error = -EFAULT; + + gfs2_glock_dq(&gh); + gfs2_holder_uninit(&gh); + return error; +} + +void gfs2_set_inode_flags(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int flags = inode->i_flags; + + flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC); + if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode)) + inode->i_flags |= S_NOSEC; + if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) + flags |= S_IMMUTABLE; + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) + flags |= S_APPEND; + if (ip->i_diskflags & GFS2_DIF_NOATIME) + flags |= S_NOATIME; + if (ip->i_diskflags & GFS2_DIF_SYNC) + flags |= S_SYNC; + inode->i_flags = flags; +} + +/* Flags that can be set by user space */ +#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ + GFS2_DIF_IMMUTABLE| \ + GFS2_DIF_APPENDONLY| \ + GFS2_DIF_NOATIME| \ + GFS2_DIF_SYNC| \ + GFS2_DIF_SYSTEM| \ + GFS2_DIF_TOPDIR| \ + GFS2_DIF_INHERIT_JDATA) + +/** + * do_gfs2_set_flags - set flags on an inode + * @filp: file pointer + * @reqflags: The flags to set + * @mask: Indicates which flags are valid + * + */ +static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) +{ + struct inode *inode = file_inode(filp); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_holder gh; + int error; + u32 new_flags, flags; + + error = mnt_want_write_file(filp); + if (error) + return error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (error) + goto out_drop_write; + + error = -EACCES; + if (!inode_owner_or_capable(inode)) + goto out; + + error = 0; + flags = ip->i_diskflags; + new_flags = (flags & ~mask) | (reqflags & mask); + if ((new_flags ^ flags) == 0) + goto out; + + error = -EINVAL; + if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET) + goto out; + + error = -EPERM; + if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE)) + goto out; + if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY)) + goto out; + if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) && + !capable(CAP_LINUX_IMMUTABLE)) + goto out; + if (!IS_IMMUTABLE(inode)) { + error = gfs2_permission(inode, MAY_WRITE); + if (error) + goto out; + } + if ((flags ^ new_flags) & GFS2_DIF_JDATA) { + if (flags & GFS2_DIF_JDATA) + gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); + error = filemap_fdatawrite(inode->i_mapping); + if (error) + goto out; + error = filemap_fdatawait(inode->i_mapping); + if (error) + goto out; + } + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + goto out; + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out_trans_end; + gfs2_trans_add_meta(ip->i_gl, bh); + ip->i_diskflags = new_flags; + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + gfs2_set_inode_flags(inode); + gfs2_set_aops(inode); +out_trans_end: + gfs2_trans_end(sdp); +out: + gfs2_glock_dq_uninit(&gh); +out_drop_write: + mnt_drop_write_file(filp); + return error; +} + +static int gfs2_set_flags(struct file *filp, u32 __user *ptr) +{ + struct inode *inode = file_inode(filp); + u32 fsflags, gfsflags; + + if (get_user(fsflags, ptr)) + return -EFAULT; + + gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); + if (!S_ISDIR(inode->i_mode)) { + gfsflags &= ~GFS2_DIF_TOPDIR; + if (gfsflags & GFS2_DIF_INHERIT_JDATA) + gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); + return do_gfs2_set_flags(filp, gfsflags, ~0); + } + return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); +} + +static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch(cmd) { + case FS_IOC_GETFLAGS: + return gfs2_get_flags(filp, (u32 __user *)arg); + case FS_IOC_SETFLAGS: + return gfs2_set_flags(filp, (u32 __user *)arg); + case FITRIM: + return gfs2_fitrim(filp, (void __user *)arg); + } + return -ENOTTY; +} + +/** + * gfs2_size_hint - Give a hint to the size of a write request + * @filep: The struct file + * @offset: The file offset of the write + * @size: The length of the write + * + * When we are about to do a write, this function records the total + * write size in order to provide a suitable hint to the lower layers + * about how many blocks will be required. + * + */ + +static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size) +{ + struct inode *inode = file_inode(filep); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift; + int hint = min_t(size_t, INT_MAX, blks); + + atomic_set(&ip->i_res->rs_sizehint, hint); +} + +/** + * gfs2_allocate_page_backing - Use bmap to allocate blocks + * @page: The (locked) page to allocate backing for + * + * We try to allocate all the blocks required for the page in + * one go. This might fail for various reasons, so we keep + * trying until all the blocks to back this page are allocated. + * If some of the blocks are already allocated, thats ok too. + */ + +static int gfs2_allocate_page_backing(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct buffer_head bh; + unsigned long size = PAGE_CACHE_SIZE; + u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + do { + bh.b_state = 0; + bh.b_size = size; + gfs2_block_map(inode, lblock, &bh, 1); + if (!buffer_mapped(&bh)) + return -EIO; + size -= bh.b_size; + lblock += (bh.b_size >> inode->i_blkbits); + } while(size > 0); + return 0; +} + +/** + * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable + * @vma: The virtual memory area + * @vmf: The virtual memory fault containing the page to become writable + * + * When the page becomes writable, we need to ensure that we have + * blocks allocated on disk to back that page. + */ + +static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_alloc_parms ap = { .aflags = 0, }; + unsigned long last_index; + u64 pos = page->index << PAGE_CACHE_SHIFT; + unsigned int data_blocks, ind_blocks, rblocks; + struct gfs2_holder gh; + loff_t size; + int ret; + + sb_start_pagefault(inode->i_sb); + + /* Update file times before taking page lock */ + file_update_time(vma->vm_file); + + ret = get_write_access(inode); + if (ret) + goto out; + + ret = gfs2_rs_alloc(ip); + if (ret) + goto out_write_access; + + gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE); + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret) + goto out_uninit; + + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); + set_bit(GIF_SW_PAGED, &ip->i_flags); + + if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) { + lock_page(page); + if (!PageUptodate(page) || page->mapping != inode->i_mapping) { + ret = -EAGAIN; + unlock_page(page); + } + goto out_unlock; + } + + ret = gfs2_rindex_update(sdp); + if (ret) + goto out_unlock; + + ret = gfs2_quota_lock_check(ip); + if (ret) + goto out_unlock; + gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); + ap.target = data_blocks + ind_blocks; + ret = gfs2_inplace_reserve(ip, &ap); + if (ret) + goto out_quota_unlock; + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + if (ind_blocks || data_blocks) { + rblocks += RES_STATFS + RES_QUOTA; + rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks); + } + ret = gfs2_trans_begin(sdp, rblocks, 0); + if (ret) + goto out_trans_fail; + + lock_page(page); + ret = -EINVAL; + size = i_size_read(inode); + last_index = (size - 1) >> PAGE_CACHE_SHIFT; + /* Check page index against inode size */ + if (size == 0 || (page->index > last_index)) + goto out_trans_end; + + ret = -EAGAIN; + /* If truncated, we must retry the operation, we may have raced + * with the glock demotion code. + */ + if (!PageUptodate(page) || page->mapping != inode->i_mapping) + goto out_trans_end; + + /* Unstuff, if required, and allocate backing blocks for page */ + ret = 0; + if (gfs2_is_stuffed(ip)) + ret = gfs2_unstuff_dinode(ip, page); + if (ret == 0) + ret = gfs2_allocate_page_backing(page); + +out_trans_end: + if (ret) + unlock_page(page); + gfs2_trans_end(sdp); +out_trans_fail: + gfs2_inplace_release(ip); +out_quota_unlock: + gfs2_quota_unlock(ip); +out_unlock: + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + if (ret == 0) { + set_page_dirty(page); + wait_for_stable_page(page); + } +out_write_access: + put_write_access(inode); +out: + sb_end_pagefault(inode->i_sb); + return block_page_mkwrite_return(ret); +} + +static const struct vm_operations_struct gfs2_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = gfs2_page_mkwrite, + .remap_pages = generic_file_remap_pages, +}; + +/** + * gfs2_mmap - + * @file: The file to map + * @vma: The VMA which described the mapping + * + * There is no need to get a lock here unless we should be updating + * atime. We ignore any locking errors since the only consequence is + * a missed atime update (which will just be deferred until later). + * + * Returns: 0 + */ + +static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + + if (!(file->f_flags & O_NOATIME) && + !IS_NOATIME(&ip->i_inode)) { + struct gfs2_holder i_gh; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (error) + return error; + /* grab lock to update inode */ + gfs2_glock_dq_uninit(&i_gh); + file_accessed(file); + } + vma->vm_ops = &gfs2_vm_ops; + + return 0; +} + +/** + * gfs2_open_common - This is common to open and atomic_open + * @inode: The inode being opened + * @file: The file being opened + * + * This maybe called under a glock or not depending upon how it has + * been called. We must always be called under a glock for regular + * files, however. For other file types, it does not matter whether + * we hold the glock or not. + * + * Returns: Error code or 0 for success + */ + +int gfs2_open_common(struct inode *inode, struct file *file) +{ + struct gfs2_file *fp; + int ret; + + if (S_ISREG(inode->i_mode)) { + ret = generic_file_open(inode, file); + if (ret) + return ret; + } + + fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS); + if (!fp) + return -ENOMEM; + + mutex_init(&fp->f_fl_mutex); + + gfs2_assert_warn(GFS2_SB(inode), !file->private_data); + file->private_data = fp; + return 0; +} + +/** + * gfs2_open - open a file + * @inode: the inode to open + * @file: the struct file for this opening + * + * After atomic_open, this function is only used for opening files + * which are already cached. We must still get the glock for regular + * files to ensure that we have the file size uptodate for the large + * file check which is in the common code. That is only an issue for + * regular files though. + * + * Returns: errno + */ + +static int gfs2_open(struct inode *inode, struct file *file) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + int error; + bool need_unlock = false; + + if (S_ISREG(ip->i_inode.i_mode)) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (error) + return error; + need_unlock = true; + } + + error = gfs2_open_common(inode, file); + + if (need_unlock) + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +/** + * gfs2_release - called to close a struct file + * @inode: the inode the struct file belongs to + * @file: the struct file being closed + * + * Returns: errno + */ + +static int gfs2_release(struct inode *inode, struct file *file) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + kfree(file->private_data); + file->private_data = NULL; + + if (!(file->f_mode & FMODE_WRITE)) + return 0; + + gfs2_rs_delete(ip, &inode->i_writecount); + return 0; +} + +/** + * gfs2_fsync - sync the dirty data for a file (across the cluster) + * @file: the file that points to the dentry + * @start: the start position in the file to sync + * @end: the end position in the file to sync + * @datasync: set if we can ignore timestamp changes + * + * We split the data flushing here so that we don't wait for the data + * until after we've also sent the metadata to disk. Note that for + * data=ordered, we will write & wait for the data at the log flush + * stage anyway, so this is unlikely to make much of a difference + * except in the data=writeback case. + * + * If the fdatawrite fails due to any reason except -EIO, we will + * continue the remainder of the fsync, although we'll still report + * the error at the end. This is to match filemap_write_and_wait_range() + * behaviour. + * + * Returns: errno + */ + +static int gfs2_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + int sync_state = inode->i_state & I_DIRTY; + struct gfs2_inode *ip = GFS2_I(inode); + int ret = 0, ret1 = 0; + + if (mapping->nrpages) { + ret1 = filemap_fdatawrite_range(mapping, start, end); + if (ret1 == -EIO) + return ret1; + } + + if (!gfs2_is_jdata(ip)) + sync_state &= ~I_DIRTY_PAGES; + if (datasync) + sync_state &= ~I_DIRTY_SYNC; + + if (sync_state) { + ret = sync_inode_metadata(inode, 1); + if (ret) + return ret; + if (gfs2_is_jdata(ip)) + filemap_write_and_wait(mapping); + gfs2_ail_flush(ip->i_gl, 1); + } + + if (mapping->nrpages) + ret = filemap_fdatawait_range(mapping, start, end); + + return ret ? ret : ret1; +} + +/** + * gfs2_file_write_iter - Perform a write to a file + * @iocb: The io context + * @iov: The data to write + * @nr_segs: Number of @iov segments + * @pos: The file position + * + * We have to do a lock/unlock here to refresh the inode size for + * O_APPEND writes, otherwise we can land up writing at the wrong + * offset. There is still a race, but provided the app is using its + * own file locking, this will make O_APPEND work as expected. + * + */ + +static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct gfs2_inode *ip = GFS2_I(file_inode(file)); + int ret; + + ret = gfs2_rs_alloc(ip); + if (ret) + return ret; + + gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from)); + + if (file->f_flags & O_APPEND) { + struct gfs2_holder gh; + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) + return ret; + gfs2_glock_dq_uninit(&gh); + } + + return generic_file_write_iter(iocb, from); +} + +static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *dibh; + int error; + loff_t size = len; + unsigned int nr_blks; + sector_t lblock = offset >> inode->i_blkbits; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(error)) + return error; + + gfs2_trans_add_meta(ip->i_gl, dibh); + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (unlikely(error)) + goto out; + } + + while (len) { + struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; + bh_map.b_size = len; + set_buffer_zeronew(&bh_map); + + error = gfs2_block_map(inode, lblock, &bh_map, 1); + if (unlikely(error)) + goto out; + len -= bh_map.b_size; + nr_blks = bh_map.b_size >> inode->i_blkbits; + lblock += nr_blks; + if (!buffer_new(&bh_map)) + continue; + if (unlikely(!buffer_zeronew(&bh_map))) { + error = -EIO; + goto out; + } + } + if (offset + size > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) + i_size_write(inode, offset + size); + + mark_inode_dirty(inode); + +out: + brelse(dibh); + return error; +} + +static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, + unsigned int *data_blocks, unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int max_blocks = ip->i_rgd->rd_free_clone; + unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); + + for (tmp = max_data; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + max_data -= tmp; + } + /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, + so it might end up with fewer data blocks */ + if (max_data <= *data_blocks) + return; + *data_blocks = max_data; + *ind_blocks = max_blocks - max_data; + *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; + if (*len > max) { + *len = max; + gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); + } +} + +static long gfs2_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_alloc_parms ap = { .aflags = 0, }; + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + loff_t bytes, max_bytes; + int error; + const loff_t pos = offset; + const loff_t count = len; + loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); + loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; + loff_t max_chunk_size = UINT_MAX & bsize_mask; + struct gfs2_holder gh; + + next = (next + 1) << sdp->sd_sb.sb_bsize_shift; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + offset &= bsize_mask; + + len = next - offset; + bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; + if (!bytes) + bytes = UINT_MAX; + bytes &= bsize_mask; + if (bytes == 0) + bytes = sdp->sd_sb.sb_bsize; + + error = gfs2_rs_alloc(ip); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + error = gfs2_glock_nq(&gh); + if (unlikely(error)) + goto out_uninit; + + gfs2_size_hint(file, offset, len); + + while (len > 0) { + if (len < bytes) + bytes = len; + if (!gfs2_write_alloc_required(ip, offset, bytes)) { + len -= bytes; + offset += bytes; + continue; + } + error = gfs2_quota_lock_check(ip); + if (error) + goto out_unlock; + +retry: + gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); + + ap.target = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip, &ap); + if (error) { + if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { + bytes >>= 1; + bytes &= bsize_mask; + if (bytes == 0) + bytes = sdp->sd_sb.sb_bsize; + goto retry; + } + goto out_qunlock; + } + max_bytes = bytes; + calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len, + &max_bytes, &data_blocks, &ind_blocks); + + rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + + RES_RG_HDR + gfs2_rg_blocks(ip, data_blocks + ind_blocks); + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = fallocate_chunk(inode, offset, max_bytes, mode); + gfs2_trans_end(sdp); + + if (error) + goto out_trans_fail; + + len -= max_bytes; + offset += max_bytes; + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + } + + if (error == 0) + error = generic_write_sync(file, pos, count); + goto out_unlock; + +out_trans_fail: + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_unlock: + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + mutex_unlock(&inode->i_mutex); + return error; +} + +#ifdef CONFIG_GFS2_FS_LOCKING_DLM + +/** + * gfs2_setlease - acquire/release a file lease + * @file: the file pointer + * @arg: lease type + * @fl: file lock + * + * We don't currently have a way to enforce a lease across the whole + * cluster; until we do, disable leases (by just returning -EINVAL), + * unless the administrator has requested purely local locking. + * + * Locking: called under i_lock + * + * Returns: errno + */ + +static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) +{ + return -EINVAL; +} + +/** + * gfs2_lock - acquire/release a posix lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + if (cmd == F_CANCELLK) { + /* Hack: */ + cmd = F_SETLK; + fl->fl_type = F_UNLCK; + } + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { + if (fl->fl_type == F_UNLCK) + posix_lock_file_wait(file, fl); + return -EIO; + } + if (IS_GETLK(cmd)) + return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (fl->fl_type == F_UNLCK) + return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); + else + return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); +} + +static int do_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + struct gfs2_inode *ip = GFS2_I(file_inode(file)); + struct gfs2_glock *gl; + unsigned int state; + int flags; + int error = 0; + + state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; + flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT; + + mutex_lock(&fp->f_fl_mutex); + + gl = fl_gh->gh_gl; + if (gl) { + if (fl_gh->gh_state == state) + goto out; + flock_lock_file_wait(file, + &(struct file_lock){.fl_type = F_UNLCK}); + gfs2_glock_dq(fl_gh); + gfs2_holder_reinit(state, flags, fl_gh); + } else { + error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, + &gfs2_flock_glops, CREATE, &gl); + if (error) + goto out; + gfs2_holder_init(gl, state, flags, fl_gh); + gfs2_glock_put(gl); + } + error = gfs2_glock_nq(fl_gh); + if (error) { + gfs2_holder_uninit(fl_gh); + if (error == GLR_TRYFAILED) + error = -EAGAIN; + } else { + error = flock_lock_file_wait(file, fl); + gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); + } + +out: + mutex_unlock(&fp->f_fl_mutex); + return error; +} + +static void do_unflock(struct file *file, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + + mutex_lock(&fp->f_fl_mutex); + flock_lock_file_wait(file, fl); + if (fl_gh->gh_gl) { + gfs2_glock_dq_wait(fl_gh); + gfs2_holder_uninit(fl_gh); + } + mutex_unlock(&fp->f_fl_mutex); +} + +/** + * gfs2_flock - acquire/release a flock lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) +{ + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + if (fl->fl_type & LOCK_MAND) + return -EOPNOTSUPP; + + if (fl->fl_type == F_UNLCK) { + do_unflock(file, fl); + return 0; + } else { + return do_flock(file, cmd, fl); + } +} + +const struct file_operations gfs2_file_fops = { + .llseek = gfs2_llseek, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = gfs2_file_write_iter, + .unlocked_ioctl = gfs2_ioctl, + .mmap = gfs2_mmap, + .open = gfs2_open, + .release = gfs2_release, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .flock = gfs2_flock, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .setlease = gfs2_setlease, + .fallocate = gfs2_fallocate, +}; + +const struct file_operations gfs2_dir_fops = { + .iterate = gfs2_readdir, + .unlocked_ioctl = gfs2_ioctl, + .open = gfs2_open, + .release = gfs2_release, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .flock = gfs2_flock, + .llseek = default_llseek, +}; + +#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ + +const struct file_operations gfs2_file_fops_nolock = { + .llseek = gfs2_llseek, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = gfs2_file_write_iter, + .unlocked_ioctl = gfs2_ioctl, + .mmap = gfs2_mmap, + .open = gfs2_open, + .release = gfs2_release, + .fsync = gfs2_fsync, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .setlease = generic_setlease, + .fallocate = gfs2_fallocate, +}; + +const struct file_operations gfs2_dir_fops_nolock = { + .iterate = gfs2_readdir, + .unlocked_ioctl = gfs2_ioctl, + .open = gfs2_open, + .release = gfs2_release, + .fsync = gfs2_fsync, + .llseek = default_llseek, +}; + diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h index 3bb11c0f8b5..ef606e3a5cf 100644 --- a/fs/gfs2/gfs2.h +++ b/fs/gfs2/gfs2.h @@ -16,11 +16,6 @@ enum { }; enum { - NO_WAIT = 0, - WAIT = 1, -}; - -enum { NO_FORCE = 0, FORCE = 1, }; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 7175a4d0643..ee4e04fe60f 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1,16 +1,17 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> -#include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/delay.h> #include <linux/sort.h> @@ -18,10 +19,8 @@ #include <linux/kallsyms.h> #include <linux/gfs2_ondisk.h> #include <linux/list.h> -#include <linux/lm_interface.h> #include <linux/wait.h> #include <linux/module.h> -#include <linux/rwsem.h> #include <asm/uaccess.h> #include <linux/seq_file.h> #include <linux/debugfs.h> @@ -29,127 +28,53 @@ #include <linux/freezer.h> #include <linux/workqueue.h> #include <linux/jiffies.h> +#include <linux/rcupdate.h> +#include <linux/rculist_bl.h> +#include <linux/bit_spinlock.h> +#include <linux/percpu.h> +#include <linux/list_sort.h> +#include <linux/lockref.h> #include "gfs2.h" #include "incore.h" #include "glock.h" #include "glops.h" #include "inode.h" -#include "lm.h" #include "lops.h" #include "meta_io.h" #include "quota.h" #include "super.h" #include "util.h" - -struct gfs2_gl_hash_bucket { - struct hlist_head hb_list; -}; - -struct glock_iter { - int hash; /* hash bucket index */ - struct gfs2_sbd *sdp; /* incore superblock */ - struct gfs2_glock *gl; /* current glock struct */ - struct seq_file *seq; /* sequence file for debugfs */ - char string[512]; /* scratch space */ +#include "bmap.h" +#define CREATE_TRACE_POINTS +#include "trace_gfs2.h" + +struct gfs2_glock_iter { + int hash; /* hash bucket index */ + unsigned nhash; /* Index within current bucket */ + struct gfs2_sbd *sdp; /* incore superblock */ + struct gfs2_glock *gl; /* current glock struct */ + loff_t last_pos; /* last position */ }; typedef void (*glock_examiner) (struct gfs2_glock * gl); -static int gfs2_dump_lockstate(struct gfs2_sbd *sdp); -static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl); -static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh); -static void gfs2_glock_drop_th(struct gfs2_glock *gl); -static void run_queue(struct gfs2_glock *gl); +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); -static DECLARE_RWSEM(gfs2_umount_flush_sem); static struct dentry *gfs2_root; -static struct task_struct *scand_process; -static unsigned int scand_secs = 5; static struct workqueue_struct *glock_workqueue; +struct workqueue_struct *gfs2_delete_workqueue; +static LIST_HEAD(lru_list); +static atomic_t lru_count = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SHIFT 15 #define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) #define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1) -static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE]; +static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE]; static struct dentry *gfs2_root; -/* - * Despite what you might think, the numbers below are not arbitrary :-) - * They are taken from the ipv4 routing hash code, which is well tested - * and thus should be nearly optimal. Later on we might tweek the numbers - * but for now this should be fine. - * - * The reason for putting the locks in a separate array from the list heads - * is that we can have fewer locks than list heads and save memory. We use - * the same hash function for both, but with a different hash mask. - */ -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ - defined(CONFIG_PROVE_LOCKING) - -#ifdef CONFIG_LOCKDEP -# define GL_HASH_LOCK_SZ 256 -#else -# if NR_CPUS >= 32 -# define GL_HASH_LOCK_SZ 4096 -# elif NR_CPUS >= 16 -# define GL_HASH_LOCK_SZ 2048 -# elif NR_CPUS >= 8 -# define GL_HASH_LOCK_SZ 1024 -# elif NR_CPUS >= 4 -# define GL_HASH_LOCK_SZ 512 -# else -# define GL_HASH_LOCK_SZ 256 -# endif -#endif - -/* We never want more locks than chains */ -#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ -# undef GL_HASH_LOCK_SZ -# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE -#endif - -static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ]; - -static inline rwlock_t *gl_lock_addr(unsigned int x) -{ - return &gl_hash_locks[x & (GL_HASH_LOCK_SZ-1)]; -} -#else /* not SMP, so no spinlocks required */ -static inline rwlock_t *gl_lock_addr(unsigned int x) -{ - return NULL; -} -#endif - -/** - * relaxed_state_ok - is a requested lock compatible with the current lock mode? - * @actual: the current state of the lock - * @requested: the lock state that was requested by the caller - * @flags: the modifier flags passed in by the caller - * - * Returns: 1 if the locks are compatible, 0 otherwise - */ - -static inline int relaxed_state_ok(unsigned int actual, unsigned requested, - int flags) -{ - if (actual == requested) - return 1; - - if (flags & GL_EXACT) - return 0; - - if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED) - return 1; - - if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY)) - return 1; - - return 0; -} - /** * gl_hash() - Turn glock number into hash bucket number * @lock: The glock number @@ -170,25 +95,35 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp, return h; } -/** - * glock_free() - Perform a few checks and then release struct gfs2_glock - * @gl: The glock to release - * - * Also calls lock module to release its internal structure for this glock. - * - */ +static inline void spin_lock_bucket(unsigned int hash) +{ + hlist_bl_lock(&gl_hash_table[hash]); +} -static void glock_free(struct gfs2_glock *gl) +static inline void spin_unlock_bucket(unsigned int hash) { - struct gfs2_sbd *sdp = gl->gl_sbd; - struct inode *aspace = gl->gl_aspace; + hlist_bl_unlock(&gl_hash_table[hash]); +} - gfs2_lm_put_lock(sdp, gl->gl_lock); +static void gfs2_glock_dealloc(struct rcu_head *rcu) +{ + struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); + + if (gl->gl_ops->go_flags & GLOF_ASPACE) { + kmem_cache_free(gfs2_glock_aspace_cachep, gl); + } else { + kfree(gl->gl_lksb.sb_lvbptr); + kmem_cache_free(gfs2_glock_cachep, gl); + } +} - if (aspace) - gfs2_aspace_put(aspace); +void gfs2_glock_free(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; - kmem_cache_free(gfs2_glock_cachep, gl); + call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); + if (atomic_dec_and_test(&sdp->sd_glock_disposal)) + wake_up(&sdp->sd_glock_wait); } /** @@ -197,9 +132,61 @@ static void glock_free(struct gfs2_glock *gl) * */ -void gfs2_glock_hold(struct gfs2_glock *gl) +static void gfs2_glock_hold(struct gfs2_glock *gl) { - atomic_inc(&gl->gl_ref); + GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); + lockref_get(&gl->gl_lockref); +} + +/** + * demote_ok - Check to see if it's ok to unlock a glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int demote_ok(const struct gfs2_glock *gl) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + + if (gl->gl_state == LM_ST_UNLOCKED) + return 0; + if (!list_empty(&gl->gl_holders)) + return 0; + if (glops->go_demote_ok) + return glops->go_demote_ok(gl); + return 1; +} + + +void gfs2_glock_add_to_lru(struct gfs2_glock *gl) +{ + spin_lock(&lru_lock); + + if (!list_empty(&gl->gl_lru)) + list_del_init(&gl->gl_lru); + else + atomic_inc(&lru_count); + + list_add_tail(&gl->gl_lru, &lru_list); + set_bit(GLF_LRU, &gl->gl_flags); + spin_unlock(&lru_lock); +} + +static void __gfs2_glock_remove_from_lru(struct gfs2_glock *gl) +{ + if (!list_empty(&gl->gl_lru)) { + list_del_init(&gl->gl_lru); + atomic_dec(&lru_count); + clear_bit(GLF_LRU, &gl->gl_flags); + } +} + +static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) +{ + spin_lock(&lru_lock); + __gfs2_glock_remove_from_lru(gl); + spin_unlock(&lru_lock); } /** @@ -208,27 +195,27 @@ void gfs2_glock_hold(struct gfs2_glock *gl) * */ -int gfs2_glock_put(struct gfs2_glock *gl) +void gfs2_glock_put(struct gfs2_glock *gl) { - int rv = 0; struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = gfs2_glock2aspace(gl); - write_lock(gl_lock_addr(gl->gl_hash)); - if (atomic_dec_and_test(&gl->gl_ref)) { - hlist_del(&gl->gl_list); - write_unlock(gl_lock_addr(gl->gl_hash)); - gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED); - gfs2_assert(sdp, list_empty(&gl->gl_reclaim)); - gfs2_assert(sdp, list_empty(&gl->gl_holders)); - gfs2_assert(sdp, list_empty(&gl->gl_waiters1)); - gfs2_assert(sdp, list_empty(&gl->gl_waiters3)); - glock_free(gl); - rv = 1; - goto out; - } - write_unlock(gl_lock_addr(gl->gl_hash)); -out: - return rv; + if (lockref_put_or_lock(&gl->gl_lockref)) + return; + + lockref_mark_dead(&gl->gl_lockref); + + spin_lock(&lru_lock); + __gfs2_glock_remove_from_lru(gl); + spin_unlock(&lru_lock); + spin_unlock(&gl->gl_lockref.lock); + spin_lock_bucket(gl->gl_hash); + hlist_bl_del_rcu(&gl->gl_list); + spin_unlock_bucket(gl->gl_hash); + GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); + GLOCK_BUG_ON(gl, mapping && mapping->nrpages); + trace_gfs2_glock_put(gl); + sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } /** @@ -244,836 +231,715 @@ static struct gfs2_glock *search_bucket(unsigned int hash, const struct lm_lockname *name) { struct gfs2_glock *gl; - struct hlist_node *h; + struct hlist_bl_node *h; - hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) { + hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) { if (!lm_name_equal(&gl->gl_name, name)) continue; if (gl->gl_sbd != sdp) continue; - - atomic_inc(&gl->gl_ref); - - return gl; + if (lockref_get_not_dead(&gl->gl_lockref)) + return gl; } return NULL; } /** - * gfs2_glock_find() - Find glock by lock number - * @sdp: The GFS2 superblock - * @name: The lock name + * may_grant - check if its ok to grant a new lock + * @gl: The glock + * @gh: The lock request which we wish to grant * - * Returns: NULL, or the struct gfs2_glock with the requested number + * Returns: true if its ok to grant the lock */ -static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp, - const struct lm_lockname *name) +static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh) { - unsigned int hash = gl_hash(sdp, name); - struct gfs2_glock *gl; - - read_lock(gl_lock_addr(hash)); - gl = search_bucket(hash, sdp, name); - read_unlock(gl_lock_addr(hash)); - - return gl; + const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list); + if ((gh->gh_state == LM_ST_EXCLUSIVE || + gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head) + return 0; + if (gl->gl_state == gh->gh_state) + return 1; + if (gh->gh_flags & GL_EXACT) + return 0; + if (gl->gl_state == LM_ST_EXCLUSIVE) { + if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED) + return 1; + if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED) + return 1; + } + if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY)) + return 1; + return 0; } -static void glock_work_func(struct work_struct *work) +static void gfs2_holder_wake(struct gfs2_holder *gh) { - struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); - - spin_lock(&gl->gl_spin); - if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags)) - set_bit(GLF_DEMOTE, &gl->gl_flags); - run_queue(gl); - spin_unlock(&gl->gl_spin); - gfs2_glock_put(gl); + clear_bit(HIF_WAIT, &gh->gh_iflags); + smp_mb__after_atomic(); + wake_up_bit(&gh->gh_iflags, HIF_WAIT); } /** - * gfs2_glock_get() - Get a glock, or create one if one doesn't exist - * @sdp: The GFS2 superblock - * @number: the lock number - * @glops: The glock_operations to use - * @create: If 0, don't create the glock if it doesn't exist - * @glp: the glock is returned here - * - * This does not lock a glock, just finds/creates structures for one. + * do_error - Something unexpected has happened during a lock request * - * Returns: errno */ -int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, - const struct gfs2_glock_operations *glops, int create, - struct gfs2_glock **glp) +static inline void do_error(struct gfs2_glock *gl, const int ret) { - struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type }; - struct gfs2_glock *gl, *tmp; - unsigned int hash = gl_hash(sdp, &name); - int error; + struct gfs2_holder *gh, *tmp; - read_lock(gl_lock_addr(hash)); - gl = search_bucket(hash, sdp, &name); - read_unlock(gl_lock_addr(hash)); - - if (gl || !create) { - *glp = gl; - return 0; + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (ret & LM_OUT_ERROR) + gh->gh_error = -EIO; + else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) + gh->gh_error = GLR_TRYFAILED; + else + continue; + list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); + gfs2_holder_wake(gh); } +} - gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); - if (!gl) - return -ENOMEM; +/** + * do_promote - promote as many requests as possible on the current queue + * @gl: The glock + * + * Returns: 1 if there is a blocked holder at the head of the list, or 2 + * if a type specific operation is underway. + */ - gl->gl_flags = 0; - gl->gl_name = name; - atomic_set(&gl->gl_ref, 1); - gl->gl_state = LM_ST_UNLOCKED; - gl->gl_demote_state = LM_ST_EXCLUSIVE; - gl->gl_hash = hash; - gl->gl_owner_pid = NULL; - gl->gl_ip = 0; - gl->gl_ops = glops; - gl->gl_req_gh = NULL; - gl->gl_req_bh = NULL; - gl->gl_vn = 0; - gl->gl_stamp = jiffies; - gl->gl_tchange = jiffies; - gl->gl_object = NULL; - gl->gl_sbd = sdp; - gl->gl_aspace = NULL; - INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); +static int do_promote(struct gfs2_glock *gl) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_holder *gh, *tmp; + int ret; - /* If this glock protects actual on-disk data or metadata blocks, - create a VFS inode to manage the pages/buffers holding them. */ - if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) { - gl->gl_aspace = gfs2_aspace_get(sdp); - if (!gl->gl_aspace) { - error = -ENOMEM; - goto fail; +restart: + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (may_grant(gl, gh)) { + if (gh->gh_list.prev == &gl->gl_holders && + glops->go_lock) { + spin_unlock(&gl->gl_spin); + /* FIXME: eliminate this eventually */ + ret = glops->go_lock(gh); + spin_lock(&gl->gl_spin); + if (ret) { + if (ret == 1) + return 2; + gh->gh_error = ret; + list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); + gfs2_holder_wake(gh); + goto restart; + } + set_bit(HIF_HOLDER, &gh->gh_iflags); + trace_gfs2_promote(gh, 1); + gfs2_holder_wake(gh); + goto restart; + } + set_bit(HIF_HOLDER, &gh->gh_iflags); + trace_gfs2_promote(gh, 0); + gfs2_holder_wake(gh); + continue; } + if (gh->gh_list.prev == &gl->gl_holders) + return 1; + do_error(gl, 0); + break; } - - error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock); - if (error) - goto fail_aspace; - - write_lock(gl_lock_addr(hash)); - tmp = search_bucket(hash, sdp, &name); - if (tmp) { - write_unlock(gl_lock_addr(hash)); - glock_free(gl); - gl = tmp; - } else { - hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list); - write_unlock(gl_lock_addr(hash)); - } - - *glp = gl; - return 0; - -fail_aspace: - if (gl->gl_aspace) - gfs2_aspace_put(gl->gl_aspace); -fail: - kmem_cache_free(gfs2_glock_cachep, gl); - return error; } /** - * gfs2_holder_init - initialize a struct gfs2_holder in the default way + * find_first_waiter - find the first gh that's waiting for the glock * @gl: the glock - * @state: the state we're requesting - * @flags: the modifier flags - * @gh: the holder structure - * */ -void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, - struct gfs2_holder *gh) +static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl) { - INIT_LIST_HEAD(&gh->gh_list); - gh->gh_gl = gl; - gh->gh_ip = (unsigned long)__builtin_return_address(0); - gh->gh_owner_pid = get_pid(task_pid(current)); - gh->gh_state = state; - gh->gh_flags = flags; - gh->gh_error = 0; - gh->gh_iflags = 0; - gfs2_glock_hold(gl); -} - -/** - * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it - * @state: the state we're requesting - * @flags: the modifier flags - * @gh: the holder structure - * - * Don't mess with the glock. - * - */ + struct gfs2_holder *gh; -void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh) -{ - gh->gh_state = state; - gh->gh_flags = flags; - gh->gh_iflags = 0; - gh->gh_ip = (unsigned long)__builtin_return_address(0); + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + return gh; + } + return NULL; } /** - * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference) - * @gh: the holder structure + * state_change - record that the glock is now in a different state + * @gl: the glock + * @new_state the new state * */ -void gfs2_holder_uninit(struct gfs2_holder *gh) +static void state_change(struct gfs2_glock *gl, unsigned int new_state) { - put_pid(gh->gh_owner_pid); - gfs2_glock_put(gh->gh_gl); - gh->gh_gl = NULL; - gh->gh_ip = 0; -} + int held1, held2; -static void gfs2_holder_wake(struct gfs2_holder *gh) -{ - clear_bit(HIF_WAIT, &gh->gh_iflags); - smp_mb__after_clear_bit(); - wake_up_bit(&gh->gh_iflags, HIF_WAIT); -} + held1 = (gl->gl_state != LM_ST_UNLOCKED); + held2 = (new_state != LM_ST_UNLOCKED); -static int just_schedule(void *word) -{ - schedule(); - return 0; -} + if (held1 != held2) { + GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); + if (held2) + gl->gl_lockref.count++; + else + gl->gl_lockref.count--; + } + if (held1 && held2 && list_empty(&gl->gl_holders)) + clear_bit(GLF_QUEUED, &gl->gl_flags); -static void wait_on_holder(struct gfs2_holder *gh) -{ - might_sleep(); - wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE); + if (new_state != gl->gl_target) + /* shorten our minimum hold time */ + gl->gl_hold_time = max(gl->gl_hold_time - GL_GLOCK_HOLD_DECR, + GL_GLOCK_MIN_HOLD); + gl->gl_state = new_state; + gl->gl_tchange = jiffies; } static void gfs2_demote_wake(struct gfs2_glock *gl) { gl->gl_demote_state = LM_ST_EXCLUSIVE; - clear_bit(GLF_DEMOTE, &gl->gl_flags); - smp_mb__after_clear_bit(); - wake_up_bit(&gl->gl_flags, GLF_DEMOTE); -} - -static void wait_on_demote(struct gfs2_glock *gl) -{ - might_sleep(); - wait_on_bit(&gl->gl_flags, GLF_DEMOTE, just_schedule, TASK_UNINTERRUPTIBLE); + clear_bit(GLF_DEMOTE, &gl->gl_flags); + smp_mb__after_atomic(); + wake_up_bit(&gl->gl_flags, GLF_DEMOTE); } /** - * rq_mutex - process a mutex request in the queue - * @gh: the glock holder + * finish_xmote - The DLM has replied to one of our lock requests + * @gl: The glock + * @ret: The status from the DLM * - * Returns: 1 if the queue is blocked */ -static int rq_mutex(struct gfs2_holder *gh) +static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) { - struct gfs2_glock *gl = gh->gh_gl; - - list_del_init(&gh->gh_list); - /* gh->gh_error never examined. */ - set_bit(GLF_LOCK, &gl->gl_flags); - clear_bit(HIF_WAIT, &gh->gh_iflags); - smp_mb(); - wake_up_bit(&gh->gh_iflags, HIF_WAIT); - - return 1; -} - -/** - * rq_promote - process a promote request in the queue - * @gh: the glock holder - * - * Acquire a new inter-node lock, or change a lock state to more restrictive. - * - * Returns: 1 if the queue is blocked - */ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_holder *gh; + unsigned state = ret & LM_OUT_ST_MASK; + int rv; -static int rq_promote(struct gfs2_holder *gh) -{ - struct gfs2_glock *gl = gh->gh_gl; + spin_lock(&gl->gl_spin); + trace_gfs2_glock_state_change(gl, state); + state_change(gl, state); + gh = find_first_waiter(gl); + + /* Demote to UN request arrived during demote to SH or DF */ + if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && + state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED) + gl->gl_target = LM_ST_UNLOCKED; + + /* Check for state != intended state */ + if (unlikely(state != gl->gl_target)) { + if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { + /* move to back of queue and try next entry */ + if (ret & LM_OUT_CANCELED) { + if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0) + list_move_tail(&gh->gh_list, &gl->gl_holders); + gh = find_first_waiter(gl); + gl->gl_target = gh->gh_state; + goto retry; + } + /* Some error or failed "try lock" - report it */ + if ((ret & LM_OUT_ERROR) || + (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { + gl->gl_target = gl->gl_state; + do_error(gl, ret); + goto out; + } + } + switch(state) { + /* Unlocked due to conversion deadlock, try again */ + case LM_ST_UNLOCKED: +retry: + do_xmote(gl, gh, gl->gl_target); + break; + /* Conversion fails, unlock and try again */ + case LM_ST_SHARED: + case LM_ST_DEFERRED: + do_xmote(gl, gh, LM_ST_UNLOCKED); + break; + default: /* Everything else */ + pr_err("wanted %u got %u\n", gl->gl_target, state); + GLOCK_BUG_ON(gl, 1); + } + spin_unlock(&gl->gl_spin); + return; + } - if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { - if (list_empty(&gl->gl_holders)) { - gl->gl_req_gh = gh; - set_bit(GLF_LOCK, &gl->gl_flags); + /* Fast path - we got what we asked for */ + if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) + gfs2_demote_wake(gl); + if (state != LM_ST_UNLOCKED) { + if (glops->go_xmote_bh) { spin_unlock(&gl->gl_spin); - gfs2_glock_xmote_th(gh->gh_gl, gh); + rv = glops->go_xmote_bh(gl, gh); spin_lock(&gl->gl_spin); + if (rv) { + do_error(gl, rv); + goto out; + } } - return 1; - } - - if (list_empty(&gl->gl_holders)) { - set_bit(HIF_FIRST, &gh->gh_iflags); - set_bit(GLF_LOCK, &gl->gl_flags); - } else { - struct gfs2_holder *next_gh; - if (gh->gh_state == LM_ST_EXCLUSIVE) - return 1; - next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder, - gh_list); - if (next_gh->gh_state == LM_ST_EXCLUSIVE) - return 1; + rv = do_promote(gl); + if (rv == 2) + goto out_locked; } - - list_move_tail(&gh->gh_list, &gl->gl_holders); - gh->gh_error = 0; - set_bit(HIF_HOLDER, &gh->gh_iflags); - - gfs2_holder_wake(gh); - - return 0; +out: + clear_bit(GLF_LOCK, &gl->gl_flags); +out_locked: + spin_unlock(&gl->gl_spin); } /** - * rq_demote - process a demote request in the queue - * @gh: the glock holder + * do_xmote - Calls the DLM to change the state of a lock + * @gl: The lock state + * @gh: The holder (only for promotes) + * @target: The target lock state * - * Returns: 1 if the queue is blocked */ -static int rq_demote(struct gfs2_glock *gl) +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) { - if (!list_empty(&gl->gl_holders)) - return 1; + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_sbd *sdp = gl->gl_sbd; + unsigned int lck_flags = gh ? gh->gh_flags : 0; + int ret; - if (gl->gl_state == gl->gl_demote_state || - gl->gl_state == LM_ST_UNLOCKED) { - gfs2_demote_wake(gl); - return 0; + lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | + LM_FLAG_PRIORITY); + GLOCK_BUG_ON(gl, gl->gl_state == target); + GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); + if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && + glops->go_inval) { + set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + do_error(gl, 0); /* Fail queued try locks */ } + gl->gl_req = target; + set_bit(GLF_BLOCKING, &gl->gl_flags); + if ((gl->gl_req == LM_ST_UNLOCKED) || + (gl->gl_state == LM_ST_EXCLUSIVE) || + (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) + clear_bit(GLF_BLOCKING, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + if (glops->go_sync) + glops->go_sync(gl); + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) + glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); + clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); - set_bit(GLF_LOCK, &gl->gl_flags); - set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); - - if (gl->gl_demote_state == LM_ST_UNLOCKED || - gl->gl_state != LM_ST_EXCLUSIVE) { - spin_unlock(&gl->gl_spin); - gfs2_glock_drop_th(gl); - } else { - spin_unlock(&gl->gl_spin); - gfs2_glock_xmote_th(gl, NULL); + gfs2_glock_hold(gl); + if (sdp->sd_lockstruct.ls_ops->lm_lock) { + /* lock_dlm */ + ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); + if (ret) { + pr_err("lm_lock ret %d\n", ret); + GLOCK_BUG_ON(gl, 1); + } + } else { /* lock_nolock */ + finish_xmote(gl, target); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); } spin_lock(&gl->gl_spin); - clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); - - return 0; } /** - * run_queue - process holder structures on a glock + * find_first_holder - find the first "holder" gh * @gl: the glock - * */ -static void run_queue(struct gfs2_glock *gl) + +static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) { struct gfs2_holder *gh; - int blocked = 1; - for (;;) { - if (test_bit(GLF_LOCK, &gl->gl_flags)) - break; - - if (!list_empty(&gl->gl_waiters1)) { - gh = list_entry(gl->gl_waiters1.next, - struct gfs2_holder, gh_list); - blocked = rq_mutex(gh); - } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { - blocked = rq_demote(gl); - if (gl->gl_waiters2 && !blocked) { - set_bit(GLF_DEMOTE, &gl->gl_flags); - gl->gl_demote_state = LM_ST_UNLOCKED; - } - gl->gl_waiters2 = 0; - } else if (!list_empty(&gl->gl_waiters3)) { - gh = list_entry(gl->gl_waiters3.next, - struct gfs2_holder, gh_list); - blocked = rq_promote(gh); - } else - break; - - if (blocked) - break; + if (!list_empty(&gl->gl_holders)) { + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + return gh; } + return NULL; } /** - * gfs2_glmutex_lock - acquire a local lock on a glock - * @gl: the glock + * run_queue - do all outstanding tasks related to a glock + * @gl: The glock in question + * @nonblock: True if we must not block in run_queue * - * Gives caller exclusive access to manipulate a glock structure. */ -static void gfs2_glmutex_lock(struct gfs2_glock *gl) +static void run_queue(struct gfs2_glock *gl, const int nonblock) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) { - spin_lock(&gl->gl_spin); - if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { - struct gfs2_holder gh; - - gfs2_holder_init(gl, 0, 0, &gh); - set_bit(HIF_WAIT, &gh.gh_iflags); - list_add_tail(&gh.gh_list, &gl->gl_waiters1); - spin_unlock(&gl->gl_spin); - wait_on_holder(&gh); - gfs2_holder_uninit(&gh); - } else { - gl->gl_owner_pid = get_pid(task_pid(current)); - gl->gl_ip = (unsigned long)__builtin_return_address(0); - spin_unlock(&gl->gl_spin); - } -} - -/** - * gfs2_glmutex_trylock - try to acquire a local lock on a glock - * @gl: the glock - * - * Returns: 1 if the glock is acquired - */ + struct gfs2_holder *gh = NULL; + int ret; -static int gfs2_glmutex_trylock(struct gfs2_glock *gl) -{ - int acquired = 1; + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) + return; - spin_lock(&gl->gl_spin); - if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { - acquired = 0; + GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); + + if (test_bit(GLF_DEMOTE, &gl->gl_flags) && + gl->gl_demote_state != gl->gl_state) { + if (find_first_holder(gl)) + goto out_unlock; + if (nonblock) + goto out_sched; + set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); + GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); + gl->gl_target = gl->gl_demote_state; } else { - gl->gl_owner_pid = get_pid(task_pid(current)); - gl->gl_ip = (unsigned long)__builtin_return_address(0); + if (test_bit(GLF_DEMOTE, &gl->gl_flags)) + gfs2_demote_wake(gl); + ret = do_promote(gl); + if (ret == 0) + goto out_unlock; + if (ret == 2) + goto out; + gh = find_first_waiter(gl); + gl->gl_target = gh->gh_state; + if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) + do_error(gl, 0); /* Fail queued try locks */ } - spin_unlock(&gl->gl_spin); + do_xmote(gl, gh, gl->gl_target); +out: + return; - return acquired; -} +out_sched: + clear_bit(GLF_LOCK, &gl->gl_flags); + smp_mb__after_atomic(); + gl->gl_lockref.count++; + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gl->gl_lockref.count--; + return; -/** - * gfs2_glmutex_unlock - release a local lock on a glock - * @gl: the glock - * - */ +out_unlock: + clear_bit(GLF_LOCK, &gl->gl_flags); + smp_mb__after_atomic(); + return; +} -static void gfs2_glmutex_unlock(struct gfs2_glock *gl) +static void delete_work_func(struct work_struct *work) { - struct pid *pid; + struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip; + struct inode *inode; + u64 no_addr = gl->gl_name.ln_number; - spin_lock(&gl->gl_spin); - clear_bit(GLF_LOCK, &gl->gl_flags); - pid = gl->gl_owner_pid; - gl->gl_owner_pid = NULL; - gl->gl_ip = 0; - run_queue(gl); - spin_unlock(&gl->gl_spin); + ip = gl->gl_object; + /* Note: Unsafe to dereference ip as we don't hold right refs/locks */ - put_pid(pid); + if (ip) + inode = gfs2_ilookup(sdp->sd_vfs, no_addr, 1); + else + inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); + if (inode && !IS_ERR(inode)) { + d_prune_aliases(inode); + iput(inode); + } + gfs2_glock_put(gl); } -/** - * handle_callback - process a demote request - * @gl: the glock - * @state: the state the caller wants us to change to - * - * There are only two requests that we are going to see in actual - * practise: LM_ST_SHARED and LM_ST_UNLOCKED - */ - -static void handle_callback(struct gfs2_glock *gl, unsigned int state, - int remote, unsigned long delay) +static void glock_work_func(struct work_struct *work) { - int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; + unsigned long delay = 0; + struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); + int drop_ref = 0; + if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { + finish_xmote(gl, gl->gl_reply); + drop_ref = 1; + } spin_lock(&gl->gl_spin); - set_bit(bit, &gl->gl_flags); - if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { - gl->gl_demote_state = state; - gl->gl_demote_time = jiffies; - if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN && - gl->gl_object) { - gfs2_glock_schedule_for_reclaim(gl); - spin_unlock(&gl->gl_spin); - return; + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + gl->gl_state != LM_ST_UNLOCKED && + gl->gl_demote_state != LM_ST_EXCLUSIVE) { + unsigned long holdtime, now = jiffies; + + holdtime = gl->gl_tchange + gl->gl_hold_time; + if (time_before(now, holdtime)) + delay = holdtime - now; + + if (!delay) { + clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); + set_bit(GLF_DEMOTE, &gl->gl_flags); } - } else if (gl->gl_demote_state != LM_ST_UNLOCKED && - gl->gl_demote_state != state) { - if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) - gl->gl_waiters2 = 1; - else - gl->gl_demote_state = LM_ST_UNLOCKED; } + run_queue(gl, 0); spin_unlock(&gl->gl_spin); + if (!delay) + gfs2_glock_put(gl); + else { + if (gl->gl_name.ln_type != LM_TYPE_INODE) + delay = 0; + if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + gfs2_glock_put(gl); + } + if (drop_ref) + gfs2_glock_put(gl); } /** - * state_change - record that the glock is now in a different state - * @gl: the glock - * @new_state the new state + * gfs2_glock_get() - Get a glock, or create one if one doesn't exist + * @sdp: The GFS2 superblock + * @number: the lock number + * @glops: The glock_operations to use + * @create: If 0, don't create the glock if it doesn't exist + * @glp: the glock is returned here + * + * This does not lock a glock, just finds/creates structures for one. * + * Returns: errno */ -static void state_change(struct gfs2_glock *gl, unsigned int new_state) +int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, int create, + struct gfs2_glock **glp) { - int held1, held2; - - held1 = (gl->gl_state != LM_ST_UNLOCKED); - held2 = (new_state != LM_ST_UNLOCKED); - - if (held1 != held2) { - if (held2) - gfs2_glock_hold(gl); - else - gfs2_glock_put(gl); - } + struct super_block *s = sdp->sd_vfs; + struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type }; + struct gfs2_glock *gl, *tmp; + unsigned int hash = gl_hash(sdp, &name); + struct address_space *mapping; + struct kmem_cache *cachep; - gl->gl_state = new_state; - gl->gl_tchange = jiffies; -} + rcu_read_lock(); + gl = search_bucket(hash, sdp, &name); + rcu_read_unlock(); -/** - * xmote_bh - Called after the lock module is done acquiring a lock - * @gl: The glock in question - * @ret: the int returned from the lock module - * - */ + *glp = gl; + if (gl) + return 0; + if (!create) + return -ENOENT; -static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - const struct gfs2_glock_operations *glops = gl->gl_ops; - struct gfs2_holder *gh = gl->gl_req_gh; - int prev_state = gl->gl_state; - int op_done = 1; - - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); - gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC)); - - state_change(gl, ret & LM_OUT_ST_MASK); - - if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) { - if (glops->go_inval) - glops->go_inval(gl, DIO_METADATA); - } else if (gl->gl_state == LM_ST_DEFERRED) { - /* We might not want to do this here. - Look at moving to the inode glops. */ - if (glops->go_inval) - glops->go_inval(gl, 0); - } + if (glops->go_flags & GLOF_ASPACE) + cachep = gfs2_glock_aspace_cachep; + else + cachep = gfs2_glock_cachep; + gl = kmem_cache_alloc(cachep, GFP_NOFS); + if (!gl) + return -ENOMEM; - /* Deal with each possible exit condition */ + memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); - if (!gh) { - gl->gl_stamp = jiffies; - if (ret & LM_OUT_CANCELED) { - op_done = 0; - } else { - spin_lock(&gl->gl_spin); - if (gl->gl_state != gl->gl_demote_state) { - gl->gl_req_bh = NULL; - spin_unlock(&gl->gl_spin); - gfs2_glock_drop_th(gl); - gfs2_glock_put(gl); - return; - } - gfs2_demote_wake(gl); - spin_unlock(&gl->gl_spin); - } - } else { - spin_lock(&gl->gl_spin); - list_del_init(&gh->gh_list); - gh->gh_error = -EIO; - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - goto out; - gh->gh_error = GLR_CANCELED; - if (ret & LM_OUT_CANCELED) - goto out; - if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { - list_add_tail(&gh->gh_list, &gl->gl_holders); - gh->gh_error = 0; - set_bit(HIF_HOLDER, &gh->gh_iflags); - set_bit(HIF_FIRST, &gh->gh_iflags); - op_done = 0; - goto out; + if (glops->go_flags & GLOF_LVB) { + gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_NOFS); + if (!gl->gl_lksb.sb_lvbptr) { + kmem_cache_free(cachep, gl); + return -ENOMEM; } - gh->gh_error = GLR_TRYFAILED; - if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) - goto out; - gh->gh_error = -EINVAL; - if (gfs2_assert_withdraw(sdp, 0) == -1) - fs_err(sdp, "ret = 0x%.8X\n", ret); -out: - spin_unlock(&gl->gl_spin); } - if (glops->go_xmote_bh) - glops->go_xmote_bh(gl); + atomic_inc(&sdp->sd_glock_disposal); + gl->gl_sbd = sdp; + gl->gl_flags = 0; + gl->gl_name = name; + gl->gl_lockref.count = 1; + gl->gl_state = LM_ST_UNLOCKED; + gl->gl_target = LM_ST_UNLOCKED; + gl->gl_demote_state = LM_ST_EXCLUSIVE; + gl->gl_hash = hash; + gl->gl_ops = glops; + gl->gl_dstamp = ktime_set(0, 0); + preempt_disable(); + /* We use the global stats to estimate the initial per-glock stats */ + gl->gl_stats = this_cpu_ptr(sdp->sd_lkstats)->lkstats[glops->go_type]; + preempt_enable(); + gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0; + gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0; + gl->gl_tchange = jiffies; + gl->gl_object = NULL; + gl->gl_hold_time = GL_GLOCK_DFT_HOLD; + INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); + INIT_WORK(&gl->gl_delete, delete_work_func); + + mapping = gfs2_glock2aspace(gl); + if (mapping) { + mapping->a_ops = &gfs2_meta_aops; + mapping->host = s->s_bdev->bd_inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->private_data = NULL; + mapping->backing_dev_info = s->s_bdi; + mapping->writeback_index = 0; + } - if (op_done) { - spin_lock(&gl->gl_spin); - gl->gl_req_gh = NULL; - gl->gl_req_bh = NULL; - clear_bit(GLF_LOCK, &gl->gl_flags); - spin_unlock(&gl->gl_spin); + spin_lock_bucket(hash); + tmp = search_bucket(hash, sdp, &name); + if (tmp) { + spin_unlock_bucket(hash); + kfree(gl->gl_lksb.sb_lvbptr); + kmem_cache_free(cachep, gl); + atomic_dec(&sdp->sd_glock_disposal); + gl = tmp; + } else { + hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]); + spin_unlock_bucket(hash); } - gfs2_glock_put(gl); + *glp = gl; - if (gh) - gfs2_holder_wake(gh); + return 0; } /** - * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock - * @gl: The glock in question - * @state: the requested state - * @flags: modifier flags to the lock call + * gfs2_holder_init - initialize a struct gfs2_holder in the default way + * @gl: the glock + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure * */ -static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh) +void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, + struct gfs2_holder *gh) { - struct gfs2_sbd *sdp = gl->gl_sbd; - int flags = gh ? gh->gh_flags : 0; - unsigned state = gh ? gh->gh_state : gl->gl_demote_state; - const struct gfs2_glock_operations *glops = gl->gl_ops; - int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB | - LM_FLAG_NOEXP | LM_FLAG_ANY | - LM_FLAG_PRIORITY); - unsigned int lck_ret; - - if (glops->go_xmote_th) - glops->go_xmote_th(gl); - - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); - gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED); - gfs2_assert_warn(sdp, state != gl->gl_state); - + INIT_LIST_HEAD(&gh->gh_list); + gh->gh_gl = gl; + gh->gh_ip = (unsigned long)__builtin_return_address(0); + gh->gh_owner_pid = get_pid(task_pid(current)); + gh->gh_state = state; + gh->gh_flags = flags; + gh->gh_error = 0; + gh->gh_iflags = 0; gfs2_glock_hold(gl); - gl->gl_req_bh = xmote_bh; - - lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags); - - if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR))) - return; - - if (lck_ret & LM_OUT_ASYNC) - gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC); - else - xmote_bh(gl, lck_ret); } /** - * drop_bh - Called after a lock module unlock completes - * @gl: the glock - * @ret: the return status + * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure * - * Doesn't wake up the process waiting on the struct gfs2_holder (if any) - * Doesn't drop the reference on the glock the top half took out + * Don't mess with the glock. * */ -static void drop_bh(struct gfs2_glock *gl, unsigned int ret) +void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh) { - struct gfs2_sbd *sdp = gl->gl_sbd; - const struct gfs2_glock_operations *glops = gl->gl_ops; - struct gfs2_holder *gh = gl->gl_req_gh; - - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); - gfs2_assert_warn(sdp, !ret); - - state_change(gl, LM_ST_UNLOCKED); - - if (glops->go_inval) - glops->go_inval(gl, DIO_METADATA); - - if (gh) { - spin_lock(&gl->gl_spin); - list_del_init(&gh->gh_list); - gh->gh_error = 0; - spin_unlock(&gl->gl_spin); - } - - spin_lock(&gl->gl_spin); - gfs2_demote_wake(gl); - gl->gl_req_gh = NULL; - gl->gl_req_bh = NULL; - clear_bit(GLF_LOCK, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - - gfs2_glock_put(gl); - - if (gh) - gfs2_holder_wake(gh); + gh->gh_state = state; + gh->gh_flags = flags; + gh->gh_iflags = 0; + gh->gh_ip = (unsigned long)__builtin_return_address(0); + if (gh->gh_owner_pid) + put_pid(gh->gh_owner_pid); + gh->gh_owner_pid = get_pid(task_pid(current)); } /** - * gfs2_glock_drop_th - call into the lock module to unlock a lock - * @gl: the glock + * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference) + * @gh: the holder structure * */ -static void gfs2_glock_drop_th(struct gfs2_glock *gl) +void gfs2_holder_uninit(struct gfs2_holder *gh) { - struct gfs2_sbd *sdp = gl->gl_sbd; - const struct gfs2_glock_operations *glops = gl->gl_ops; - unsigned int ret; - - if (glops->go_xmote_th) - glops->go_xmote_th(gl); - - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); - gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED); - - gfs2_glock_hold(gl); - gl->gl_req_bh = drop_bh; - - ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state); - - if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR))) - return; - - if (!ret) - drop_bh(gl, ret); - else - gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC); + put_pid(gh->gh_owner_pid); + gfs2_glock_put(gh->gh_gl); + gh->gh_gl = NULL; + gh->gh_ip = 0; } /** - * do_cancels - cancel requests for locks stuck waiting on an expire flag - * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock + * gfs2_glock_holder_wait + * @word: unused * - * Don't cancel GL_NOCANCEL requests. + * This function and gfs2_glock_demote_wait both show up in the WCHAN + * field. Thus I've separated these otherwise identical functions in + * order to be more informative to the user. */ -static void do_cancels(struct gfs2_holder *gh) +static int gfs2_glock_holder_wait(void *word) { - struct gfs2_glock *gl = gh->gh_gl; - - spin_lock(&gl->gl_spin); - - while (gl->gl_req_gh != gh && - !test_bit(HIF_HOLDER, &gh->gh_iflags) && - !list_empty(&gh->gh_list)) { - if (gl->gl_req_bh && !(gl->gl_req_gh && - (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) { - spin_unlock(&gl->gl_spin); - gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock); - msleep(100); - spin_lock(&gl->gl_spin); - } else { - spin_unlock(&gl->gl_spin); - msleep(100); - spin_lock(&gl->gl_spin); - } - } + schedule(); + return 0; +} - spin_unlock(&gl->gl_spin); +static int gfs2_glock_demote_wait(void *word) +{ + schedule(); + return 0; } /** - * glock_wait_internal - wait on a glock acquisition + * gfs2_glock_wait - wait on a glock acquisition * @gh: the glock holder * * Returns: 0 on success */ -static int glock_wait_internal(struct gfs2_holder *gh) +int gfs2_glock_wait(struct gfs2_holder *gh) { - struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_sbd *sdp = gl->gl_sbd; - const struct gfs2_glock_operations *glops = gl->gl_ops; - - if (test_bit(HIF_ABORTED, &gh->gh_iflags)) - return -EIO; - - if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { - spin_lock(&gl->gl_spin); - if (gl->gl_req_gh != gh && - !test_bit(HIF_HOLDER, &gh->gh_iflags) && - !list_empty(&gh->gh_list)) { - list_del_init(&gh->gh_list); - gh->gh_error = GLR_TRYFAILED; - run_queue(gl); - spin_unlock(&gl->gl_spin); - return gh->gh_error; - } - spin_unlock(&gl->gl_spin); - } - - if (gh->gh_flags & LM_FLAG_PRIORITY) - do_cancels(gh); - - wait_on_holder(gh); - if (gh->gh_error) - return gh->gh_error; - - gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags)); - gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state, - gh->gh_flags)); - - if (test_bit(HIF_FIRST, &gh->gh_iflags)) { - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - - if (glops->go_lock) { - gh->gh_error = glops->go_lock(gh); - if (gh->gh_error) { - spin_lock(&gl->gl_spin); - list_del_init(&gh->gh_list); - spin_unlock(&gl->gl_spin); - } - } - - spin_lock(&gl->gl_spin); - gl->gl_req_gh = NULL; - gl->gl_req_bh = NULL; - clear_bit(GLF_LOCK, &gl->gl_flags); - run_queue(gl); - spin_unlock(&gl->gl_spin); - } + unsigned long time1 = jiffies; + might_sleep(); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE); + if (time_after(jiffies, time1 + HZ)) /* have we waited > a second? */ + /* Lengthen the minimum hold time. */ + gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time + + GL_GLOCK_HOLD_INCR, + GL_GLOCK_MAX_HOLD); return gh->gh_error; } -static inline struct gfs2_holder * -find_holder_by_owner(struct list_head *head, struct pid *pid) +/** + * handle_callback - process a demote request + * @gl: the glock + * @state: the state the caller wants us to change to + * + * There are only two requests that we are going to see in actual + * practise: LM_ST_SHARED and LM_ST_UNLOCKED + */ + +static void handle_callback(struct gfs2_glock *gl, unsigned int state, + unsigned long delay, bool remote) { - struct gfs2_holder *gh; + int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; - list_for_each_entry(gh, head, gh_list) { - if (gh->gh_owner_pid == pid) - return gh; + set_bit(bit, &gl->gl_flags); + if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { + gl->gl_demote_state = state; + gl->gl_demote_time = jiffies; + } else if (gl->gl_demote_state != LM_ST_UNLOCKED && + gl->gl_demote_state != state) { + gl->gl_demote_state = LM_ST_UNLOCKED; } - - return NULL; + if (gl->gl_ops->go_callback) + gl->gl_ops->go_callback(gl, remote); + trace_gfs2_demote_rq(gl, remote); } -static void print_dbg(struct glock_iter *gi, const char *fmt, ...) +void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - if (gi) { - vsprintf(gi->string, fmt, args); - seq_printf(gi->seq, gi->string); + + if (seq) { + seq_vprintf(seq, fmt, args); + } else { + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("%pV", &vaf); } - else - vprintk(fmt, args); + va_end(args); } @@ -1081,50 +947,81 @@ static void print_dbg(struct glock_iter *gi, const char *fmt, ...) * add_to_queue - Add a holder to the wait queue (but look for recursion) * @gh: the holder structure to add * + * Eventually we should move the recursive locking trap to a + * debugging option or something like that. This is the fast + * path and needs to have the minimum number of distractions. + * */ -static void add_to_queue(struct gfs2_holder *gh) +static inline void add_to_queue(struct gfs2_holder *gh) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) { struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_holder *existing; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct list_head *insert_pt = NULL; + struct gfs2_holder *gh2; + int try_futile = 0; BUG_ON(gh->gh_owner_pid == NULL); if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) BUG(); - if (!(gh->gh_flags & GL_FLOCK)) { - existing = find_holder_by_owner(&gl->gl_holders, - gh->gh_owner_pid); - if (existing) { - print_symbol(KERN_WARNING "original: %s\n", - existing->gh_ip); - printk(KERN_INFO "pid : %d\n", - pid_nr(existing->gh_owner_pid)); - printk(KERN_INFO "lock type : %d lock state : %d\n", - existing->gh_gl->gl_name.ln_type, - existing->gh_gl->gl_state); - print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); - printk(KERN_INFO "pid : %d\n", - pid_nr(gh->gh_owner_pid)); - printk(KERN_INFO "lock type : %d lock state : %d\n", - gl->gl_name.ln_type, gl->gl_state); - BUG(); - } - - existing = find_holder_by_owner(&gl->gl_waiters3, - gh->gh_owner_pid); - if (existing) { - print_symbol(KERN_WARNING "original: %s\n", - existing->gh_ip); - print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); - BUG(); + if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { + if (test_bit(GLF_LOCK, &gl->gl_flags)) + try_futile = !may_grant(gl, gh); + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) + goto fail; + } + + list_for_each_entry(gh2, &gl->gl_holders, gh_list) { + if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid && + (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK))) + goto trap_recursive; + if (try_futile && + !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { +fail: + gh->gh_error = GLR_TRYFAILED; + gfs2_holder_wake(gh); + return; } + if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) + continue; + if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) + insert_pt = &gh2->gh_list; + } + set_bit(GLF_QUEUED, &gl->gl_flags); + trace_gfs2_glock_queue(gh, 1); + gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); + gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); + if (likely(insert_pt == NULL)) { + list_add_tail(&gh->gh_list, &gl->gl_holders); + if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) + goto do_cancel; + return; + } + list_add_tail(&gh->gh_list, insert_pt); +do_cancel: + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (!(gh->gh_flags & LM_FLAG_PRIORITY)) { + spin_unlock(&gl->gl_spin); + if (sdp->sd_lockstruct.ls_ops->lm_cancel) + sdp->sd_lockstruct.ls_ops->lm_cancel(gl); + spin_lock(&gl->gl_spin); } + return; - if (gh->gh_flags & LM_FLAG_PRIORITY) - list_add(&gh->gh_list, &gl->gl_waiters3); - else - list_add_tail(&gh->gh_list, &gl->gl_waiters3); +trap_recursive: + pr_err("original: %pSR\n", (void *)gh2->gh_ip); + pr_err("pid: %d\n", pid_nr(gh2->gh_owner_pid)); + pr_err("lock type: %d req lock state : %d\n", + gh2->gh_gl->gl_name.ln_type, gh2->gh_state); + pr_err("new: %pSR\n", (void *)gh->gh_ip); + pr_err("pid: %d\n", pid_nr(gh->gh_owner_pid)); + pr_err("lock type: %d req lock state : %d\n", + gh->gh_gl->gl_name.ln_type, gh->gh_state); + gfs2_dump_glock(NULL, gl); + BUG(); } /** @@ -1142,24 +1039,26 @@ int gfs2_glock_nq(struct gfs2_holder *gh) struct gfs2_sbd *sdp = gl->gl_sbd; int error = 0; -restart: - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { - set_bit(HIF_ABORTED, &gh->gh_iflags); + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) return -EIO; - } + + if (test_bit(GLF_LRU, &gl->gl_flags)) + gfs2_glock_remove_from_lru(gl); spin_lock(&gl->gl_spin); add_to_queue(gh); - run_queue(gl); + if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) && + test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) { + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + gl->gl_lockref.count++; + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gl->gl_lockref.count--; + } + run_queue(gl, 1); spin_unlock(&gl->gl_spin); - if (!(gh->gh_flags & GL_ASYNC)) { - error = glock_wait_internal(gh); - if (error == GLR_CANCELED) { - msleep(100); - goto restart; - } - } + if (!(gh->gh_flags & GL_ASYNC)) + error = gfs2_glock_wait(gh); return error; } @@ -1173,48 +1072,7 @@ restart: int gfs2_glock_poll(struct gfs2_holder *gh) { - struct gfs2_glock *gl = gh->gh_gl; - int ready = 0; - - spin_lock(&gl->gl_spin); - - if (test_bit(HIF_HOLDER, &gh->gh_iflags)) - ready = 1; - else if (list_empty(&gh->gh_list)) { - if (gh->gh_error == GLR_CANCELED) { - spin_unlock(&gl->gl_spin); - msleep(100); - if (gfs2_glock_nq(gh)) - return 1; - return 0; - } else - ready = 1; - } - - spin_unlock(&gl->gl_spin); - - return ready; -} - -/** - * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC - * @gh: the holder structure - * - * Returns: 0, GLR_TRYFAILED, or errno on failure - */ - -int gfs2_glock_wait(struct gfs2_holder *gh) -{ - int error; - - error = glock_wait_internal(gh); - if (error == GLR_CANCELED) { - msleep(100); - gh->gh_flags &= ~GL_ASYNC; - error = gfs2_glock_nq(gh); - } - - return error; + return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1; } /** @@ -1228,31 +1086,39 @@ void gfs2_glock_dq(struct gfs2_holder *gh) struct gfs2_glock *gl = gh->gh_gl; const struct gfs2_glock_operations *glops = gl->gl_ops; unsigned delay = 0; + int fast_path = 0; + spin_lock(&gl->gl_spin); if (gh->gh_flags & GL_NOCACHE) - handle_callback(gl, LM_ST_UNLOCKED, 0, 0); + handle_callback(gl, LM_ST_UNLOCKED, 0, false); - gfs2_glmutex_lock(gl); - - spin_lock(&gl->gl_spin); list_del_init(&gh->gh_list); - - if (list_empty(&gl->gl_holders)) { + if (find_first_holder(gl) == NULL) { if (glops->go_unlock) { + GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags)); spin_unlock(&gl->gl_spin); glops->go_unlock(gh); spin_lock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); } - gl->gl_stamp = jiffies; + if (list_empty(&gl->gl_holders) && + !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags)) + fast_path = 1; } + if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl)) + gfs2_glock_add_to_lru(gl); - clear_bit(GLF_LOCK, &gl->gl_flags); + trace_gfs2_glock_queue(gh, 0); spin_unlock(&gl->gl_spin); + if (likely(fast_path)) + return; gfs2_glock_hold(gl); if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && - !test_bit(GLF_DEMOTE, &gl->gl_flags)) - delay = gl->gl_ops->go_min_hold_time; + !test_bit(GLF_DEMOTE, &gl->gl_flags) && + gl->gl_name.ln_type == LM_TYPE_INODE) + delay = gl->gl_hold_time; if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) gfs2_glock_put(gl); } @@ -1261,7 +1127,8 @@ void gfs2_glock_dq_wait(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; gfs2_glock_dq(gh); - wait_on_demote(gl); + might_sleep(); + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE); } /** @@ -1282,7 +1149,7 @@ void gfs2_glock_dq_uninit(struct gfs2_holder *gh) * @number: the lock number * @glops: the glock operations for the type of glock * @state: the state to acquire the glock in - * @flags: modifier flags for the aquisition + * @flags: modifier flags for the acquisition * @gh: the struct gfs2_holder * * Returns: errno @@ -1407,332 +1274,304 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs) void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) { - unsigned int x; - - for (x = 0; x < num_gh; x++) - gfs2_glock_dq(&ghs[x]); + while (num_gh--) + gfs2_glock_dq(&ghs[num_gh]); } -/** - * gfs2_glock_dq_uninit_m - release multiple glocks - * @num_gh: the number of structures - * @ghs: an array of struct gfs2_holder structures - * - */ - -void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) +void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) { - unsigned int x; + unsigned long delay = 0; + unsigned long holdtime; + unsigned long now = jiffies; - for (x = 0; x < num_gh; x++) - gfs2_glock_dq_uninit(&ghs[x]); + gfs2_glock_hold(gl); + holdtime = gl->gl_tchange + gl->gl_hold_time; + if (test_bit(GLF_QUEUED, &gl->gl_flags) && + gl->gl_name.ln_type == LM_TYPE_INODE) { + if (time_before(now, holdtime)) + delay = holdtime - now; + if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) + delay = gl->gl_hold_time; + } + + spin_lock(&gl->gl_spin); + handle_callback(gl, state, delay, true); + spin_unlock(&gl->gl_spin); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + gfs2_glock_put(gl); } /** - * gfs2_lvb_hold - attach a LVB from a glock + * gfs2_should_freeze - Figure out if glock should be frozen * @gl: The glock in question * + * Glocks are not frozen if (a) the result of the dlm operation is + * an error, (b) the locking operation was an unlock operation or + * (c) if there is a "noexp" flagged request anywhere in the queue + * + * Returns: 1 if freezing should occur, 0 otherwise */ -int gfs2_lvb_hold(struct gfs2_glock *gl) +static int gfs2_should_freeze(const struct gfs2_glock *gl) { - int error; + const struct gfs2_holder *gh; - gfs2_glmutex_lock(gl); + if (gl->gl_reply & ~LM_OUT_ST_MASK) + return 0; + if (gl->gl_target == LM_ST_UNLOCKED) + return 0; - if (!atomic_read(&gl->gl_lvb_count)) { - error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb); - if (error) { - gfs2_glmutex_unlock(gl); - return error; - } - gfs2_glock_hold(gl); + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (LM_FLAG_NOEXP & gh->gh_flags) + return 0; } - atomic_inc(&gl->gl_lvb_count); - - gfs2_glmutex_unlock(gl); - return 0; + return 1; } /** - * gfs2_lvb_unhold - detach a LVB from a glock - * @gl: The glock in question + * gfs2_glock_complete - Callback used by locking + * @gl: Pointer to the glock + * @ret: The return value from the dlm * + * The gl_reply field is under the gl_spin lock so that it is ok + * to use a bitfield shared with other glock state fields. */ -void gfs2_lvb_unhold(struct gfs2_glock *gl) +void gfs2_glock_complete(struct gfs2_glock *gl, int ret) { - gfs2_glock_hold(gl); - gfs2_glmutex_lock(gl); + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; - gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0); - if (atomic_dec_and_test(&gl->gl_lvb_count)) { - gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb); - gl->gl_lvb = NULL; - gfs2_glock_put(gl); + spin_lock(&gl->gl_spin); + gl->gl_reply = ret; + + if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) { + if (gfs2_should_freeze(gl)) { + set_bit(GLF_FROZEN, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + return; + } } - gfs2_glmutex_unlock(gl); - gfs2_glock_put(gl); + gl->gl_lockref.count++; + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); } -static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name, - unsigned int state) +static int glock_cmp(void *priv, struct list_head *a, struct list_head *b) { - struct gfs2_glock *gl; - unsigned long delay = 0; - unsigned long holdtime; - unsigned long now = jiffies; + struct gfs2_glock *gla, *glb; - gl = gfs2_glock_find(sdp, name); - if (!gl) - return; + gla = list_entry(a, struct gfs2_glock, gl_lru); + glb = list_entry(b, struct gfs2_glock, gl_lru); - holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; - if (time_before(now, holdtime)) - delay = holdtime - now; + if (gla->gl_name.ln_number > glb->gl_name.ln_number) + return 1; + if (gla->gl_name.ln_number < glb->gl_name.ln_number) + return -1; - handle_callback(gl, state, 1, delay); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) - gfs2_glock_put(gl); + return 0; } /** - * gfs2_glock_cb - Callback used by locking module - * @sdp: Pointer to the superblock - * @type: Type of callback - * @data: Type dependent data pointer + * gfs2_dispose_glock_lru - Demote a list of glocks + * @list: The list to dispose of + * + * Disposing of glocks may involve disk accesses, so that here we sort + * the glocks by number (i.e. disk location of the inodes) so that if + * there are any such accesses, they'll be sent in order (mostly). * - * Called by the locking module when it wants to tell us something. - * Either we need to drop a lock, one of our ASYNC requests completed, or - * a journal from another client needs to be recovered. + * Must be called under the lru_lock, but may drop and retake this + * lock. While the lru_lock is dropped, entries may vanish from the + * list, but no new entries will appear on the list (since it is + * private) */ -void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) +static void gfs2_dispose_glock_lru(struct list_head *list) +__releases(&lru_lock) +__acquires(&lru_lock) { - struct gfs2_sbd *sdp = cb_data; - - switch (type) { - case LM_CB_NEED_E: - blocking_cb(sdp, data, LM_ST_UNLOCKED); - return; - - case LM_CB_NEED_D: - blocking_cb(sdp, data, LM_ST_DEFERRED); - return; - - case LM_CB_NEED_S: - blocking_cb(sdp, data, LM_ST_SHARED); - return; + struct gfs2_glock *gl; - case LM_CB_ASYNC: { - struct lm_async_cb *async = data; - struct gfs2_glock *gl; + list_sort(NULL, list, glock_cmp); - down_read(&gfs2_umount_flush_sem); - gl = gfs2_glock_find(sdp, &async->lc_name); - if (gfs2_assert_warn(sdp, gl)) - return; - if (!gfs2_assert_warn(sdp, gl->gl_req_bh)) - gl->gl_req_bh(gl, async->lc_ret); + while(!list_empty(list)) { + gl = list_entry(list->next, struct gfs2_glock, gl_lru); + list_del_init(&gl->gl_lru); + if (!spin_trylock(&gl->gl_spin)) { +add_back_to_lru: + list_add(&gl->gl_lru, &lru_list); + atomic_inc(&lru_count); + continue; + } + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + spin_unlock(&gl->gl_spin); + goto add_back_to_lru; + } + clear_bit(GLF_LRU, &gl->gl_flags); + gl->gl_lockref.count++; + if (demote_ok(gl)) + handle_callback(gl, LM_ST_UNLOCKED, 0, false); + WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); - up_read(&gfs2_umount_flush_sem); - return; - } - - case LM_CB_NEED_RECOVERY: - gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data); - if (sdp->sd_recoverd_process) - wake_up_process(sdp->sd_recoverd_process); - return; - - case LM_CB_DROPLOCKS: - gfs2_gl_hash_clear(sdp, NO_WAIT); - gfs2_quota_scan(sdp); - return; - - default: - gfs2_assert_warn(sdp, 0); - return; + gl->gl_lockref.count--; + spin_unlock(&gl->gl_spin); + cond_resched_lock(&lru_lock); } } /** - * demote_ok - Check to see if it's ok to unlock a glock - * @gl: the glock + * gfs2_scan_glock_lru - Scan the LRU looking for locks to demote + * @nr: The number of entries to scan * - * Returns: 1 if it's ok + * This function selects the entries on the LRU which are able to + * be demoted, and then kicks off the process by calling + * gfs2_dispose_glock_lru() above. */ -static int demote_ok(struct gfs2_glock *gl) +static long gfs2_scan_glock_lru(int nr) { - const struct gfs2_glock_operations *glops = gl->gl_ops; - int demote = 1; + struct gfs2_glock *gl; + LIST_HEAD(skipped); + LIST_HEAD(dispose); + long freed = 0; + + spin_lock(&lru_lock); + while ((nr-- >= 0) && !list_empty(&lru_list)) { + gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); + + /* Test for being demotable */ + if (!test_bit(GLF_LOCK, &gl->gl_flags)) { + list_move(&gl->gl_lru, &dispose); + atomic_dec(&lru_count); + freed++; + continue; + } - if (test_bit(GLF_STICKY, &gl->gl_flags)) - demote = 0; - else if (glops->go_demote_ok) - demote = glops->go_demote_ok(gl); + list_move(&gl->gl_lru, &skipped); + } + list_splice(&skipped, &lru_list); + if (!list_empty(&dispose)) + gfs2_dispose_glock_lru(&dispose); + spin_unlock(&lru_lock); - return demote; + return freed; } -/** - * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list - * @gl: the glock - * - */ - -void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) +static unsigned long gfs2_glock_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) { - struct gfs2_sbd *sdp = gl->gl_sbd; - - spin_lock(&sdp->sd_reclaim_lock); - if (list_empty(&gl->gl_reclaim)) { - gfs2_glock_hold(gl); - list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list); - atomic_inc(&sdp->sd_reclaim_count); - } - spin_unlock(&sdp->sd_reclaim_lock); + if (!(sc->gfp_mask & __GFP_FS)) + return SHRINK_STOP; + return gfs2_scan_glock_lru(sc->nr_to_scan); +} - wake_up(&sdp->sd_reclaim_wq); +static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return vfs_pressure_ratio(atomic_read(&lru_count)); } +static struct shrinker glock_shrinker = { + .seeks = DEFAULT_SEEKS, + .count_objects = gfs2_glock_shrink_count, + .scan_objects = gfs2_glock_shrink_scan, +}; + /** - * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list + * examine_bucket - Call a function for glock in a hash bucket + * @examiner: the function * @sdp: the filesystem - * - * Called from gfs2_glockd() glock reclaim daemon, or when promoting a - * different glock and we notice that there are a lot of glocks in the - * reclaim list. + * @bucket: the bucket * */ -void gfs2_reclaim_glock(struct gfs2_sbd *sdp) +static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp, + unsigned int hash) { struct gfs2_glock *gl; + struct hlist_bl_head *head = &gl_hash_table[hash]; + struct hlist_bl_node *pos; - spin_lock(&sdp->sd_reclaim_lock); - if (list_empty(&sdp->sd_reclaim_list)) { - spin_unlock(&sdp->sd_reclaim_lock); - return; + rcu_read_lock(); + hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) { + if ((gl->gl_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref)) + examiner(gl); } - gl = list_entry(sdp->sd_reclaim_list.next, - struct gfs2_glock, gl_reclaim); - list_del_init(&gl->gl_reclaim); - spin_unlock(&sdp->sd_reclaim_lock); - - atomic_dec(&sdp->sd_reclaim_count); - atomic_inc(&sdp->sd_reclaimed); + rcu_read_unlock(); + cond_resched(); +} - if (gfs2_glmutex_trylock(gl)) { - if (list_empty(&gl->gl_holders) && - gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) - handle_callback(gl, LM_ST_UNLOCKED, 0, 0); - gfs2_glmutex_unlock(gl); - } +static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) +{ + unsigned x; - gfs2_glock_put(gl); + for (x = 0; x < GFS2_GL_HASH_SIZE; x++) + examine_bucket(examiner, sdp, x); } + /** - * examine_bucket - Call a function for glock in a hash bucket - * @examiner: the function - * @sdp: the filesystem - * @bucket: the bucket + * thaw_glock - thaw out a glock which has an unprocessed reply waiting + * @gl: The glock to thaw * - * Returns: 1 if the bucket has entries */ -static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp, - unsigned int hash) +static void thaw_glock(struct gfs2_glock *gl) { - struct gfs2_glock *gl, *prev = NULL; - int has_entries = 0; - struct hlist_head *head = &gl_hash_table[hash].hb_list; - - read_lock(gl_lock_addr(hash)); - /* Can't use hlist_for_each_entry - don't want prefetch here */ - if (hlist_empty(head)) + if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) goto out; - gl = list_entry(head->first, struct gfs2_glock, gl_list); - while(1) { - if (!sdp || gl->gl_sbd == sdp) { - gfs2_glock_hold(gl); - read_unlock(gl_lock_addr(hash)); - if (prev) - gfs2_glock_put(prev); - prev = gl; - examiner(gl); - has_entries = 1; - read_lock(gl_lock_addr(hash)); - } - if (gl->gl_list.next == NULL) - break; - gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list); - } + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) { out: - read_unlock(gl_lock_addr(hash)); - if (prev) - gfs2_glock_put(prev); - cond_resched(); - return has_entries; + gfs2_glock_put(gl); + } } /** - * scan_glock - look at a glock and see if we can reclaim it + * clear_glock - look at a glock and see if we can free it from glock cache * @gl: the glock to look at * */ -static void scan_glock(struct gfs2_glock *gl) +static void clear_glock(struct gfs2_glock *gl) { - if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) - return; - - if (gfs2_glmutex_trylock(gl)) { - if (list_empty(&gl->gl_holders) && - gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) - goto out_schedule; - gfs2_glmutex_unlock(gl); - } - return; + gfs2_glock_remove_from_lru(gl); -out_schedule: - gfs2_glmutex_unlock(gl); - gfs2_glock_schedule_for_reclaim(gl); + spin_lock(&gl->gl_spin); + if (gl->gl_state != LM_ST_UNLOCKED) + handle_callback(gl, LM_ST_UNLOCKED, 0, false); + spin_unlock(&gl->gl_spin); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); } /** - * clear_glock - look at a glock and see if we can free it from glock cache - * @gl: the glock to look at + * gfs2_glock_thaw - Thaw any frozen glocks + * @sdp: The super block * */ -static void clear_glock(struct gfs2_glock *gl) +void gfs2_glock_thaw(struct gfs2_sbd *sdp) { - struct gfs2_sbd *sdp = gl->gl_sbd; - int released; - - spin_lock(&sdp->sd_reclaim_lock); - if (!list_empty(&gl->gl_reclaim)) { - list_del_init(&gl->gl_reclaim); - atomic_dec(&sdp->sd_reclaim_count); - spin_unlock(&sdp->sd_reclaim_lock); - released = gfs2_glock_put(gl); - gfs2_assert(sdp, !released); - } else { - spin_unlock(&sdp->sd_reclaim_lock); - } + glock_hash_walk(thaw_glock, sdp); +} - if (gfs2_glmutex_trylock(gl)) { - if (list_empty(&gl->gl_holders) && - gl->gl_state != LM_ST_UNLOCKED) - handle_callback(gl, LM_ST_UNLOCKED, 0, 0); - gfs2_glmutex_unlock(gl); - } +static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl) +{ + spin_lock(&gl->gl_spin); + gfs2_dump_glock(seq, gl); + spin_unlock(&gl->gl_spin); +} + +static void dump_glock_func(struct gfs2_glock *gl) +{ + dump_glock(NULL, gl); } /** @@ -1740,434 +1579,411 @@ static void clear_glock(struct gfs2_glock *gl) * @sdp: the filesystem * @wait: wait until it's all gone * - * Called when unmounting the filesystem, or when inter-node lock manager - * requests DROPLOCKS because it is running out of capacity. + * Called when unmounting the filesystem. */ -void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) { - unsigned long t; - unsigned int x; - int cont; - - t = jiffies; - - for (;;) { - cont = 0; - for (x = 0; x < GFS2_GL_HASH_SIZE; x++) { - if (examine_bucket(clear_glock, sdp, x)) - cont = 1; - } + set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags); + flush_workqueue(glock_workqueue); + glock_hash_walk(clear_glock, sdp); + flush_workqueue(glock_workqueue); + wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); + glock_hash_walk(dump_glock_func, sdp); +} - if (!wait || !cont) - break; +void gfs2_glock_finish_truncate(struct gfs2_inode *ip) +{ + struct gfs2_glock *gl = ip->i_gl; + int ret; - if (time_after_eq(jiffies, - t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) { - fs_warn(sdp, "Unmount seems to be stalled. " - "Dumping lock state...\n"); - gfs2_dump_lockstate(sdp); - t = jiffies; - } + ret = gfs2_truncatei_resume(ip); + gfs2_assert_withdraw(gl->gl_sbd, ret == 0); - down_write(&gfs2_umount_flush_sem); - invalidate_inodes(sdp->sd_vfs); - up_write(&gfs2_umount_flush_sem); - msleep(10); - } + spin_lock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); + run_queue(gl, 1); + spin_unlock(&gl->gl_spin); } -/* - * Diagnostic routines to help debug distributed deadlock - */ - -static void gfs2_print_symbol(struct glock_iter *gi, const char *fmt, - unsigned long address) +static const char *state2str(unsigned state) { - char buffer[KSYM_SYMBOL_LEN]; - - sprint_symbol(buffer, address); - print_dbg(gi, fmt, buffer); + switch(state) { + case LM_ST_UNLOCKED: + return "UN"; + case LM_ST_SHARED: + return "SH"; + case LM_ST_DEFERRED: + return "DF"; + case LM_ST_EXCLUSIVE: + return "EX"; + } + return "??"; +} + +static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) +{ + char *p = buf; + if (flags & LM_FLAG_TRY) + *p++ = 't'; + if (flags & LM_FLAG_TRY_1CB) + *p++ = 'T'; + if (flags & LM_FLAG_NOEXP) + *p++ = 'e'; + if (flags & LM_FLAG_ANY) + *p++ = 'A'; + if (flags & LM_FLAG_PRIORITY) + *p++ = 'p'; + if (flags & GL_ASYNC) + *p++ = 'a'; + if (flags & GL_EXACT) + *p++ = 'E'; + if (flags & GL_NOCACHE) + *p++ = 'c'; + if (test_bit(HIF_HOLDER, &iflags)) + *p++ = 'H'; + if (test_bit(HIF_WAIT, &iflags)) + *p++ = 'W'; + if (test_bit(HIF_FIRST, &iflags)) + *p++ = 'F'; + *p = 0; + return buf; } /** * dump_holder - print information about a glock holder - * @str: a string naming the type of holder + * @seq: the seq_file struct * @gh: the glock holder * - * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int dump_holder(struct glock_iter *gi, char *str, - struct gfs2_holder *gh) +static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) { - unsigned int x; - struct task_struct *gh_owner; + struct task_struct *gh_owner = NULL; + char flags_buf[32]; - print_dbg(gi, " %s\n", str); - if (gh->gh_owner_pid) { - print_dbg(gi, " owner = %ld ", - (long)pid_nr(gh->gh_owner_pid)); + rcu_read_lock(); + if (gh->gh_owner_pid) gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); - if (gh_owner) - print_dbg(gi, "(%s)\n", gh_owner->comm); - else - print_dbg(gi, "(ended)\n"); - } else - print_dbg(gi, " owner = -1\n"); - print_dbg(gi, " gh_state = %u\n", gh->gh_state); - print_dbg(gi, " gh_flags ="); - for (x = 0; x < 32; x++) - if (gh->gh_flags & (1 << x)) - print_dbg(gi, " %u", x); - print_dbg(gi, " \n"); - print_dbg(gi, " error = %d\n", gh->gh_error); - print_dbg(gi, " gh_iflags ="); - for (x = 0; x < 32; x++) - if (test_bit(x, &gh->gh_iflags)) - print_dbg(gi, " %u", x); - print_dbg(gi, " \n"); - gfs2_print_symbol(gi, " initialized at: %s\n", gh->gh_ip); - - return 0; + gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n", + state2str(gh->gh_state), + hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), + gh->gh_error, + gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, + gh_owner ? gh_owner->comm : "(ended)", + (void *)gh->gh_ip); + rcu_read_unlock(); +} + +static const char *gflags2str(char *buf, const struct gfs2_glock *gl) +{ + const unsigned long *gflags = &gl->gl_flags; + char *p = buf; + + if (test_bit(GLF_LOCK, gflags)) + *p++ = 'l'; + if (test_bit(GLF_DEMOTE, gflags)) + *p++ = 'D'; + if (test_bit(GLF_PENDING_DEMOTE, gflags)) + *p++ = 'd'; + if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags)) + *p++ = 'p'; + if (test_bit(GLF_DIRTY, gflags)) + *p++ = 'y'; + if (test_bit(GLF_LFLUSH, gflags)) + *p++ = 'f'; + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags)) + *p++ = 'i'; + if (test_bit(GLF_REPLY_PENDING, gflags)) + *p++ = 'r'; + if (test_bit(GLF_INITIAL, gflags)) + *p++ = 'I'; + if (test_bit(GLF_FROZEN, gflags)) + *p++ = 'F'; + if (test_bit(GLF_QUEUED, gflags)) + *p++ = 'q'; + if (test_bit(GLF_LRU, gflags)) + *p++ = 'L'; + if (gl->gl_object) + *p++ = 'o'; + if (test_bit(GLF_BLOCKING, gflags)) + *p++ = 'b'; + *p = 0; + return buf; } /** - * dump_inode - print information about an inode - * @ip: the inode + * gfs2_dump_glock - print information about a glock + * @seq: The seq_file struct + * @gl: the glock + * + * The file format is as follows: + * One line per object, capital letters are used to indicate objects + * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented, + * other objects are indented by a single space and follow the glock to + * which they are related. Fields are indicated by lower case letters + * followed by a colon and the field value, except for strings which are in + * [] so that its possible to see if they are composed of spaces for + * example. The field's are n = number (id of the object), f = flags, + * t = type, s = state, r = refcount, e = error, p = pid. * - * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip) +void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) { - unsigned int x; - - print_dbg(gi, " Inode:\n"); - print_dbg(gi, " num = %llu/%llu\n", - (unsigned long long)ip->i_no_formal_ino, - (unsigned long long)ip->i_no_addr); - print_dbg(gi, " type = %u\n", IF2DT(ip->i_inode.i_mode)); - print_dbg(gi, " i_flags ="); - for (x = 0; x < 32; x++) - if (test_bit(x, &ip->i_flags)) - print_dbg(gi, " %u", x); - print_dbg(gi, " \n"); + const struct gfs2_glock_operations *glops = gl->gl_ops; + unsigned long long dtime; + const struct gfs2_holder *gh; + char gflags_buf[32]; + + dtime = jiffies - gl->gl_demote_time; + dtime *= 1000000/HZ; /* demote time in uSec */ + if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) + dtime = 0; + gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d m:%ld\n", + state2str(gl->gl_state), + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, + gflags2str(gflags_buf, gl), + state2str(gl->gl_target), + state2str(gl->gl_demote_state), dtime, + atomic_read(&gl->gl_ail_count), + atomic_read(&gl->gl_revokes), + (int)gl->gl_lockref.count, gl->gl_hold_time); + + list_for_each_entry(gh, &gl->gl_holders, gh_list) + dump_holder(seq, gh); + + if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) + glops->go_dump(seq, gl); +} + +static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glock *gl = iter_ptr; + + seq_printf(seq, "G: n:%u/%llx rtt:%lld/%lld rttb:%lld/%lld irt:%lld/%lld dcnt: %lld qcnt: %lld\n", + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, + (long long)gl->gl_stats.stats[GFS2_LKS_SRTT], + (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR], + (long long)gl->gl_stats.stats[GFS2_LKS_SRTTB], + (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB], + (long long)gl->gl_stats.stats[GFS2_LKS_SIRT], + (long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR], + (long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT], + (long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]); return 0; } -/** - * dump_glock - print information about a glock - * @gl: the glock - * @count: where we are in the buffer - * - * Returns: 0 on success, -ENOBUFS when we run out of space - */ - -static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl) -{ - struct gfs2_holder *gh; - unsigned int x; - int error = -ENOBUFS; - struct task_struct *gl_owner; - - spin_lock(&gl->gl_spin); - - print_dbg(gi, "Glock 0x%p (%u, 0x%llx)\n", gl, gl->gl_name.ln_type, - (unsigned long long)gl->gl_name.ln_number); - print_dbg(gi, " gl_flags ="); - for (x = 0; x < 32; x++) { - if (test_bit(x, &gl->gl_flags)) - print_dbg(gi, " %u", x); - } - if (!test_bit(GLF_LOCK, &gl->gl_flags)) - print_dbg(gi, " (unlocked)"); - print_dbg(gi, " \n"); - print_dbg(gi, " gl_ref = %d\n", atomic_read(&gl->gl_ref)); - print_dbg(gi, " gl_state = %u\n", gl->gl_state); - if (gl->gl_owner_pid) { - gl_owner = pid_task(gl->gl_owner_pid, PIDTYPE_PID); - if (gl_owner) - print_dbg(gi, " gl_owner = pid %d (%s)\n", - pid_nr(gl->gl_owner_pid), gl_owner->comm); - else - print_dbg(gi, " gl_owner = %d (ended)\n", - pid_nr(gl->gl_owner_pid)); - } else - print_dbg(gi, " gl_owner = -1\n"); - print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip); - print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no"); - print_dbg(gi, " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no"); - print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count)); - print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no"); - print_dbg(gi, " reclaim = %s\n", - (list_empty(&gl->gl_reclaim)) ? "no" : "yes"); - if (gl->gl_aspace) - print_dbg(gi, " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace, - gl->gl_aspace->i_mapping->nrpages); - else - print_dbg(gi, " aspace = no\n"); - print_dbg(gi, " ail = %d\n", atomic_read(&gl->gl_ail_count)); - if (gl->gl_req_gh) { - error = dump_holder(gi, "Request", gl->gl_req_gh); - if (error) - goto out; - } - list_for_each_entry(gh, &gl->gl_holders, gh_list) { - error = dump_holder(gi, "Holder", gh); - if (error) - goto out; - } - list_for_each_entry(gh, &gl->gl_waiters1, gh_list) { - error = dump_holder(gi, "Waiter1", gh); - if (error) - goto out; - } - list_for_each_entry(gh, &gl->gl_waiters3, gh_list) { - error = dump_holder(gi, "Waiter3", gh); - if (error) - goto out; - } - if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { - print_dbg(gi, " Demotion req to state %u (%llu uS ago)\n", - gl->gl_demote_state, (unsigned long long) - (jiffies - gl->gl_demote_time)*(1000000/HZ)); - } - if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) { - if (!test_bit(GLF_LOCK, &gl->gl_flags) && - list_empty(&gl->gl_holders)) { - error = dump_inode(gi, gl->gl_object); - if (error) - goto out; - } else { - error = -ENOBUFS; - print_dbg(gi, " Inode: busy\n"); - } - } - - error = 0; +static const char *gfs2_gltype[] = { + "type", + "reserved", + "nondisk", + "inode", + "rgrp", + "meta", + "iopen", + "flock", + "plock", + "quota", + "journal", +}; -out: - spin_unlock(&gl->gl_spin); - return error; -} +static const char *gfs2_stype[] = { + [GFS2_LKS_SRTT] = "srtt", + [GFS2_LKS_SRTTVAR] = "srttvar", + [GFS2_LKS_SRTTB] = "srttb", + [GFS2_LKS_SRTTVARB] = "srttvarb", + [GFS2_LKS_SIRT] = "sirt", + [GFS2_LKS_SIRTVAR] = "sirtvar", + [GFS2_LKS_DCOUNT] = "dlm", + [GFS2_LKS_QCOUNT] = "queue", +}; -/** - * gfs2_dump_lockstate - print out the current lockstate - * @sdp: the filesystem - * @ub: the buffer to copy the information into - * - * If @ub is NULL, dump the lockstate to the console. - * - */ +#define GFS2_NR_SBSTATS (ARRAY_SIZE(gfs2_gltype) * ARRAY_SIZE(gfs2_stype)) -static int gfs2_dump_lockstate(struct gfs2_sbd *sdp) +static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr) { - struct gfs2_glock *gl; - struct hlist_node *h; - unsigned int x; - int error = 0; + struct gfs2_glock_iter *gi = seq->private; + struct gfs2_sbd *sdp = gi->sdp; + unsigned index = gi->hash >> 3; + unsigned subindex = gi->hash & 0x07; + s64 value; + int i; - for (x = 0; x < GFS2_GL_HASH_SIZE; x++) { - - read_lock(gl_lock_addr(x)); + if (index == 0 && subindex != 0) + return 0; - hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) { - if (gl->gl_sbd != sdp) - continue; + seq_printf(seq, "%-10s %8s:", gfs2_gltype[index], + (index == 0) ? "cpu": gfs2_stype[subindex]); - error = dump_glock(NULL, gl); - if (error) - break; + for_each_possible_cpu(i) { + const struct gfs2_pcpu_lkstats *lkstats = per_cpu_ptr(sdp->sd_lkstats, i); + if (index == 0) { + value = i; + } else { + value = lkstats->lkstats[index - 1].stats[subindex]; } - - read_unlock(gl_lock_addr(x)); - - if (error) - break; + seq_printf(seq, " %15lld", (long long)value); } - - - return error; -} - -/** - * gfs2_scand - Look for cached glocks and inodes to toss from memory - * @sdp: Pointer to GFS2 superblock - * - * One of these daemons runs, finding candidates to add to sd_reclaim_list. - * See gfs2_glockd() - */ - -static int gfs2_scand(void *data) -{ - unsigned x; - unsigned delay; - - while (!kthread_should_stop()) { - for (x = 0; x < GFS2_GL_HASH_SIZE; x++) - examine_bucket(scan_glock, NULL, x); - if (freezing(current)) - refrigerator(); - delay = scand_secs; - if (delay < 1) - delay = 1; - schedule_timeout_interruptible(delay * HZ); - } - + seq_putc(seq, '\n'); return 0; } - - int __init gfs2_glock_init(void) { unsigned i; for(i = 0; i < GFS2_GL_HASH_SIZE; i++) { - INIT_HLIST_HEAD(&gl_hash_table[i].hb_list); - } -#ifdef GL_HASH_LOCK_SZ - for(i = 0; i < GL_HASH_LOCK_SZ; i++) { - rwlock_init(&gl_hash_locks[i]); + INIT_HLIST_BL_HEAD(&gl_hash_table[i]); } -#endif - scand_process = kthread_run(gfs2_scand, NULL, "gfs2_scand"); - if (IS_ERR(scand_process)) - return PTR_ERR(scand_process); - - glock_workqueue = create_workqueue("glock_workqueue"); - if (IS_ERR(glock_workqueue)) { - kthread_stop(scand_process); - return PTR_ERR(glock_workqueue); + glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_FREEZABLE, 0); + if (!glock_workqueue) + return -ENOMEM; + gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", + WQ_MEM_RECLAIM | WQ_FREEZABLE, + 0); + if (!gfs2_delete_workqueue) { + destroy_workqueue(glock_workqueue); + return -ENOMEM; } + register_shrinker(&glock_shrinker); + return 0; } void gfs2_glock_exit(void) { + unregister_shrinker(&glock_shrinker); destroy_workqueue(glock_workqueue); - kthread_stop(scand_process); + destroy_workqueue(gfs2_delete_workqueue); } -module_param(scand_secs, uint, S_IRUGO|S_IWUSR); -MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs"); - -static int gfs2_glock_iter_next(struct glock_iter *gi) +static inline struct gfs2_glock *glock_hash_chain(unsigned hash) { - struct gfs2_glock *gl; - -restart: - read_lock(gl_lock_addr(gi->hash)); - gl = gi->gl; - if (gl) { - gi->gl = hlist_entry(gl->gl_list.next, - struct gfs2_glock, gl_list); - if (gi->gl) - gfs2_glock_hold(gi->gl); - } - read_unlock(gl_lock_addr(gi->hash)); - if (gl) - gfs2_glock_put(gl); - if (gl && gi->gl == NULL) - gi->hash++; - while(gi->gl == NULL) { - if (gi->hash >= GFS2_GL_HASH_SIZE) - return 1; - read_lock(gl_lock_addr(gi->hash)); - gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first, - struct gfs2_glock, gl_list); - if (gi->gl) - gfs2_glock_hold(gi->gl); - read_unlock(gl_lock_addr(gi->hash)); - gi->hash++; - } - - if (gi->sdp != gi->gl->gl_sbd) - goto restart; - - return 0; + return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]), + struct gfs2_glock, gl_list); } -static void gfs2_glock_iter_free(struct glock_iter *gi) +static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl) { - if (gi->gl) - gfs2_glock_put(gi->gl); - kfree(gi); + return hlist_bl_entry(rcu_dereference(gl->gl_list.next), + struct gfs2_glock, gl_list); } -static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp) +static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) { - struct glock_iter *gi; - - gi = kmalloc(sizeof (*gi), GFP_KERNEL); - if (!gi) - return NULL; - - gi->sdp = sdp; - gi->hash = 0; - gi->seq = NULL; - gi->gl = NULL; - memset(gi->string, 0, sizeof(gi->string)); + struct gfs2_glock *gl; - if (gfs2_glock_iter_next(gi)) { - gfs2_glock_iter_free(gi); - return NULL; - } + do { + gl = gi->gl; + if (gl) { + gi->gl = glock_hash_next(gl); + gi->nhash++; + } else { + if (gi->hash >= GFS2_GL_HASH_SIZE) { + rcu_read_unlock(); + return 1; + } + gi->gl = glock_hash_chain(gi->hash); + gi->nhash = 0; + } + while (gi->gl == NULL) { + gi->hash++; + if (gi->hash >= GFS2_GL_HASH_SIZE) { + rcu_read_unlock(); + return 1; + } + gi->gl = glock_hash_chain(gi->hash); + gi->nhash = 0; + } + /* Skip entries for other sb and dead entries */ + } while (gi->sdp != gi->gl->gl_sbd || + __lockref_is_dead(&gi->gl->gl_lockref)); - return gi; + return 0; } -static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos) +static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) { - struct glock_iter *gi; + struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; - gi = gfs2_glock_iter_init(file->private); - if (!gi) - return NULL; + if (gi->last_pos <= *pos) + n = gi->nhash + (*pos - gi->last_pos); + else + gi->hash = 0; + + gi->nhash = 0; + rcu_read_lock(); - while(n--) { - if (gfs2_glock_iter_next(gi)) { - gfs2_glock_iter_free(gi); + do { + if (gfs2_glock_iter_next(gi)) return NULL; - } - } + } while (n--); - return gi; + gi->last_pos = *pos; + return gi->gl; } -static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr, +static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) { - struct glock_iter *gi = iter_ptr; + struct gfs2_glock_iter *gi = seq->private; (*pos)++; - - if (gfs2_glock_iter_next(gi)) { - gfs2_glock_iter_free(gi); + gi->last_pos = *pos; + if (gfs2_glock_iter_next(gi)) return NULL; - } - return gi; + return gi->gl; +} + +static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glock_iter *gi = seq->private; + + if (gi->gl) + rcu_read_unlock(); + gi->gl = NULL; } -static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr) +static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) { - struct glock_iter *gi = iter_ptr; - if (gi) - gfs2_glock_iter_free(gi); + dump_glock(seq, iter_ptr); + return 0; } -static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr) +static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) { - struct glock_iter *gi = iter_ptr; + struct gfs2_glock_iter *gi = seq->private; - gi->seq = file; - dump_glock(gi, gi->gl); + gi->hash = *pos; + if (*pos >= GFS2_NR_SBSTATS) + return NULL; + preempt_disable(); + return SEQ_START_TOKEN; +} - return 0; +static void *gfs2_sbstats_seq_next(struct seq_file *seq, void *iter_ptr, + loff_t *pos) +{ + struct gfs2_glock_iter *gi = seq->private; + (*pos)++; + gi->hash++; + if (gi->hash >= GFS2_NR_SBSTATS) { + preempt_enable(); + return NULL; + } + return SEQ_START_TOKEN; +} + +static void gfs2_sbstats_seq_stop(struct seq_file *seq, void *iter_ptr) +{ + preempt_enable(); } static const struct seq_operations gfs2_glock_seq_ops = { @@ -2177,27 +1993,86 @@ static const struct seq_operations gfs2_glock_seq_ops = { .show = gfs2_glock_seq_show, }; -static int gfs2_debugfs_open(struct inode *inode, struct file *file) -{ - struct seq_file *seq; - int ret; +static const struct seq_operations gfs2_glstats_seq_ops = { + .start = gfs2_glock_seq_start, + .next = gfs2_glock_seq_next, + .stop = gfs2_glock_seq_stop, + .show = gfs2_glstats_seq_show, +}; - ret = seq_open(file, &gfs2_glock_seq_ops); - if (ret) - return ret; +static const struct seq_operations gfs2_sbstats_seq_ops = { + .start = gfs2_sbstats_seq_start, + .next = gfs2_sbstats_seq_next, + .stop = gfs2_sbstats_seq_stop, + .show = gfs2_sbstats_seq_show, +}; - seq = file->private_data; - seq->private = inode->i_private; +#define GFS2_SEQ_GOODSIZE min(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, 65536UL) - return 0; +static int gfs2_glocks_open(struct inode *inode, struct file *file) +{ + int ret = seq_open_private(file, &gfs2_glock_seq_ops, + sizeof(struct gfs2_glock_iter)); + if (ret == 0) { + struct seq_file *seq = file->private_data; + struct gfs2_glock_iter *gi = seq->private; + gi->sdp = inode->i_private; + seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); + if (seq->buf) + seq->size = GFS2_SEQ_GOODSIZE; + } + return ret; +} + +static int gfs2_glstats_open(struct inode *inode, struct file *file) +{ + int ret = seq_open_private(file, &gfs2_glstats_seq_ops, + sizeof(struct gfs2_glock_iter)); + if (ret == 0) { + struct seq_file *seq = file->private_data; + struct gfs2_glock_iter *gi = seq->private; + gi->sdp = inode->i_private; + seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); + if (seq->buf) + seq->size = GFS2_SEQ_GOODSIZE; + } + return ret; } -static const struct file_operations gfs2_debug_fops = { +static int gfs2_sbstats_open(struct inode *inode, struct file *file) +{ + int ret = seq_open_private(file, &gfs2_sbstats_seq_ops, + sizeof(struct gfs2_glock_iter)); + if (ret == 0) { + struct seq_file *seq = file->private_data; + struct gfs2_glock_iter *gi = seq->private; + gi->sdp = inode->i_private; + } + return ret; +} + +static const struct file_operations gfs2_glocks_fops = { + .owner = THIS_MODULE, + .open = gfs2_glocks_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static const struct file_operations gfs2_glstats_fops = { .owner = THIS_MODULE, - .open = gfs2_debugfs_open, + .open = gfs2_glstats_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release + .release = seq_release_private, +}; + +static const struct file_operations gfs2_sbstats_fops = { + .owner = THIS_MODULE, + .open = gfs2_sbstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, }; int gfs2_create_debugfs_file(struct gfs2_sbd *sdp) @@ -2208,20 +2083,45 @@ int gfs2_create_debugfs_file(struct gfs2_sbd *sdp) sdp->debugfs_dentry_glocks = debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, - &gfs2_debug_fops); + &gfs2_glocks_fops); if (!sdp->debugfs_dentry_glocks) - return -ENOMEM; + goto fail; + + sdp->debugfs_dentry_glstats = debugfs_create_file("glstats", + S_IFREG | S_IRUGO, + sdp->debugfs_dir, sdp, + &gfs2_glstats_fops); + if (!sdp->debugfs_dentry_glstats) + goto fail; + + sdp->debugfs_dentry_sbstats = debugfs_create_file("sbstats", + S_IFREG | S_IRUGO, + sdp->debugfs_dir, sdp, + &gfs2_sbstats_fops); + if (!sdp->debugfs_dentry_sbstats) + goto fail; return 0; +fail: + gfs2_delete_debugfs_file(sdp); + return -ENOMEM; } void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp) { - if (sdp && sdp->debugfs_dir) { + if (sdp->debugfs_dir) { if (sdp->debugfs_dentry_glocks) { debugfs_remove(sdp->debugfs_dentry_glocks); sdp->debugfs_dentry_glocks = NULL; } + if (sdp->debugfs_dentry_glstats) { + debugfs_remove(sdp->debugfs_dentry_glstats); + sdp->debugfs_dentry_glstats = NULL; + } + if (sdp->debugfs_dentry_sbstats) { + debugfs_remove(sdp->debugfs_dentry_sbstats); + sdp->debugfs_dentry_sbstats = NULL; + } debugfs_remove(sdp->debugfs_dir); sdp->debugfs_dir = NULL; } diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 2f9c6d136b3..32572f71f02 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -11,45 +11,149 @@ #define __GLOCK_DOT_H__ #include <linux/sched.h> +#include <linux/parser.h> #include "incore.h" -/* Flags for lock requests; used in gfs2_holder gh_flag field. - From lm_interface.h: +/* Options for hostdata parser */ + +enum { + Opt_jid, + Opt_id, + Opt_first, + Opt_nodir, + Opt_err, +}; + +/* + * lm_lockname types + */ + +#define LM_TYPE_RESERVED 0x00 +#define LM_TYPE_NONDISK 0x01 +#define LM_TYPE_INODE 0x02 +#define LM_TYPE_RGRP 0x03 +#define LM_TYPE_META 0x04 +#define LM_TYPE_IOPEN 0x05 +#define LM_TYPE_FLOCK 0x06 +#define LM_TYPE_PLOCK 0x07 +#define LM_TYPE_QUOTA 0x08 +#define LM_TYPE_JOURNAL 0x09 + +/* + * lm_lock() states + * + * SHARED is compatible with SHARED, not with DEFERRED or EX. + * DEFERRED is compatible with DEFERRED, not with SHARED or EX. + */ + +#define LM_ST_UNLOCKED 0 +#define LM_ST_EXCLUSIVE 1 +#define LM_ST_DEFERRED 2 +#define LM_ST_SHARED 3 + +/* + * lm_lock() flags + * + * LM_FLAG_TRY + * Don't wait to acquire the lock if it can't be granted immediately. + * + * LM_FLAG_TRY_1CB + * Send one blocking callback if TRY is set and the lock is not granted. + * + * LM_FLAG_NOEXP + * GFS sets this flag on lock requests it makes while doing journal recovery. + * These special requests should not be blocked due to the recovery like + * ordinary locks would be. + * + * LM_FLAG_ANY + * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may + * also be granted in SHARED. The preferred state is whichever is compatible + * with other granted locks, or the specified state if no other locks exist. + * + * LM_FLAG_PRIORITY + * Override fairness considerations. Suppose a lock is held in a shared state + * and there is a pending request for the deferred state. A shared lock + * request with the priority flag would be allowed to bypass the deferred + * request and directly join the other shared lock. A shared lock request + * without the priority flag might be forced to wait until the deferred + * requested had acquired and released the lock. + */ + #define LM_FLAG_TRY 0x00000001 #define LM_FLAG_TRY_1CB 0x00000002 #define LM_FLAG_NOEXP 0x00000004 #define LM_FLAG_ANY 0x00000008 -#define LM_FLAG_PRIORITY 0x00000010 */ - +#define LM_FLAG_PRIORITY 0x00000010 #define GL_ASYNC 0x00000040 #define GL_EXACT 0x00000080 #define GL_SKIP 0x00000100 -#define GL_ATIME 0x00000200 #define GL_NOCACHE 0x00000400 -#define GL_FLOCK 0x00000800 -#define GL_NOCANCEL 0x00001000 + +/* + * lm_async_cb return flags + * + * LM_OUT_ST_MASK + * Masks the lower two bits of lock state in the returned value. + * + * LM_OUT_CANCELED + * The lock request was canceled. + * + */ + +#define LM_OUT_ST_MASK 0x00000003 +#define LM_OUT_CANCELED 0x00000008 +#define LM_OUT_ERROR 0x00000004 + +/* + * lm_recovery_done() messages + */ + +#define LM_RD_GAVEUP 308 +#define LM_RD_SUCCESS 309 #define GLR_TRYFAILED 13 -#define GLR_CANCELED 14 -static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) +#define GL_GLOCK_MAX_HOLD (long)(HZ / 5) +#define GL_GLOCK_DFT_HOLD (long)(HZ / 5) +#define GL_GLOCK_MIN_HOLD (long)(10) +#define GL_GLOCK_HOLD_INCR (long)(HZ / 20) +#define GL_GLOCK_HOLD_DECR (long)(HZ / 40) + +struct lm_lockops { + const char *lm_proto_name; + int (*lm_mount) (struct gfs2_sbd *sdp, const char *table); + void (*lm_first_done) (struct gfs2_sbd *sdp); + void (*lm_recovery_result) (struct gfs2_sbd *sdp, unsigned int jid, + unsigned int result); + void (*lm_unmount) (struct gfs2_sbd *sdp); + void (*lm_withdraw) (struct gfs2_sbd *sdp); + void (*lm_put_lock) (struct gfs2_glock *gl); + int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags); + void (*lm_cancel) (struct gfs2_glock *gl); + const match_table_t *lm_tokens; +}; + +extern struct workqueue_struct *gfs2_delete_workqueue; +static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) { struct gfs2_holder *gh; - int locked = 0; struct pid *pid; /* Look in glock's list of holders for one with current task as owner */ spin_lock(&gl->gl_spin); pid = task_pid(current); list_for_each_entry(gh, &gl->gl_holders, gh_list) { - if (gh->gh_owner_pid == pid) { - locked = 1; + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) break; - } + if (gh->gh_owner_pid == pid) + goto out; } + gh = NULL; +out: spin_unlock(&gl->gl_spin); - return locked; + return gh; } static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl) @@ -67,42 +171,41 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl) return gl->gl_state == LM_ST_SHARED; } -static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl) +static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) { - int ret; - spin_lock(&gl->gl_spin); - ret = test_bit(GLF_DEMOTE, &gl->gl_flags) || !list_empty(&gl->gl_waiters3); - spin_unlock(&gl->gl_spin); - return ret; + if (gl->gl_ops->go_flags & GLOF_ASPACE) + return (struct address_space *)(gl + 1); + return NULL; } -int gfs2_glock_get(struct gfs2_sbd *sdp, - u64 number, const struct gfs2_glock_operations *glops, - int create, struct gfs2_glock **glp); -void gfs2_glock_hold(struct gfs2_glock *gl); -int gfs2_glock_put(struct gfs2_glock *gl); -void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, - struct gfs2_holder *gh); -void gfs2_holder_reinit(unsigned int state, unsigned flags, - struct gfs2_holder *gh); -void gfs2_holder_uninit(struct gfs2_holder *gh); -int gfs2_glock_nq(struct gfs2_holder *gh); -int gfs2_glock_poll(struct gfs2_holder *gh); -int gfs2_glock_wait(struct gfs2_holder *gh); -void gfs2_glock_dq(struct gfs2_holder *gh); -void gfs2_glock_dq_wait(struct gfs2_holder *gh); - -void gfs2_glock_dq_uninit(struct gfs2_holder *gh); -int gfs2_glock_nq_num(struct gfs2_sbd *sdp, - u64 number, const struct gfs2_glock_operations *glops, - unsigned int state, int flags, struct gfs2_holder *gh); - -int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); -void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); -void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); +extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + int create, struct gfs2_glock **glp); +extern void gfs2_glock_put(struct gfs2_glock *gl); +extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, + unsigned flags, struct gfs2_holder *gh); +extern void gfs2_holder_reinit(unsigned int state, unsigned flags, + struct gfs2_holder *gh); +extern void gfs2_holder_uninit(struct gfs2_holder *gh); +extern int gfs2_glock_nq(struct gfs2_holder *gh); +extern int gfs2_glock_poll(struct gfs2_holder *gh); +extern int gfs2_glock_wait(struct gfs2_holder *gh); +extern void gfs2_glock_dq(struct gfs2_holder *gh); +extern void gfs2_glock_dq_wait(struct gfs2_holder *gh); +extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh); +extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + unsigned int state, int flags, + struct gfs2_holder *gh); +extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); +extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); +extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); +#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) +extern __printf(2, 3) +void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); /** - * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock + * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock * @gl: the glock * @state: the state we're requesting * @flags: the modifier flags @@ -126,23 +229,22 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, return error; } -/* Lock Value Block functions */ - -int gfs2_lvb_hold(struct gfs2_glock *gl); -void gfs2_lvb_unhold(struct gfs2_glock *gl); - -void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); +extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); +extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret); +extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); +extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip); +extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); +extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl); +extern void gfs2_glock_free(struct gfs2_glock *gl); -void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); -void gfs2_reclaim_glock(struct gfs2_sbd *sdp); -void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait); +extern int __init gfs2_glock_init(void); +extern void gfs2_glock_exit(void); -int __init gfs2_glock_init(void); -void gfs2_glock_exit(void); +extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); +extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); +extern int gfs2_register_debugfs(void); +extern void gfs2_unregister_debugfs(void); -int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); -void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); -int gfs2_register_debugfs(void); -void gfs2_unregister_debugfs(void); +extern const struct lm_lockops gfs2_dlm_ops; #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index c663b7a0f41..2ffc67dce87 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -1,18 +1,18 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License version 2. */ -#include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> +#include <linux/bio.h> +#include <linux/posix_acl.h> #include "gfs2.h" #include "incore.h" @@ -26,74 +26,111 @@ #include "rgrp.h" #include "util.h" #include "trans.h" +#include "dir.h" + +static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) +{ + fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n", + bh, (unsigned long long)bh->b_blocknr, bh->b_state, + bh->b_page->mapping, bh->b_page->flags); + fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n", + gl->gl_name.ln_type, gl->gl_name.ln_number, + gfs2_glock2aspace(gl)); + gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n"); +} /** - * ail_empty_gl - remove all buffers for a given lock from the AIL + * __gfs2_ail_flush - remove all buffers for a given lock from the AIL * @gl: the glock + * @fsync: set when called from fsync (not all buffers will be clean) * * None of the buffers should be dirty, locked, or pinned. */ -static void gfs2_ail_empty_gl(struct gfs2_glock *gl) +static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync, + unsigned int nr_revokes) { struct gfs2_sbd *sdp = gl->gl_sbd; - unsigned int blocks; struct list_head *head = &gl->gl_ail_list; - struct gfs2_bufdata *bd; + struct gfs2_bufdata *bd, *tmp; struct buffer_head *bh; - int error; - - blocks = atomic_read(&gl->gl_ail_count); - if (!blocks) - return; - - error = gfs2_trans_begin(sdp, 0, blocks); - if (gfs2_assert_withdraw(sdp, !error)) - return; + const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock); gfs2_log_lock(sdp); - while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, - bd_ail_gl_list); + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_safe_reverse(bd, tmp, head, bd_ail_gl_list) { + if (nr_revokes == 0) + break; bh = bd->bd_bh; - gfs2_remove_from_ail(bd); - bd->bd_bh = NULL; - bh->b_private = NULL; - bd->bd_blkno = bh->b_blocknr; - gfs2_assert_withdraw(sdp, !buffer_busy(bh)); + if (bh->b_state & b_state) { + if (fsync) + continue; + gfs2_ail_error(gl, bh); + } gfs2_trans_add_revoke(sdp, bd); + nr_revokes--; } - gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); + GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count)); + spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); +} + + +static void gfs2_ail_empty_gl(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_trans tr; + + memset(&tr, 0, sizeof(tr)); + INIT_LIST_HEAD(&tr.tr_buf); + INIT_LIST_HEAD(&tr.tr_databuf); + tr.tr_revokes = atomic_read(&gl->gl_ail_count); + + if (!tr.tr_revokes) + return; + + /* A shortened, inline version of gfs2_trans_begin() + * tr->alloced is not set since the transaction structure is + * on the stack */ + tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); + tr.tr_ip = (unsigned long)__builtin_return_address(0); + sb_start_intwrite(sdp->sd_vfs); + if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) { + sb_end_intwrite(sdp->sd_vfs); + return; + } + WARN_ON_ONCE(current->journal_info); + current->journal_info = &tr; + + __gfs2_ail_flush(gl, 0, tr.tr_revokes); gfs2_trans_end(sdp); - gfs2_log_flush(sdp, NULL); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); } -/** - * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock - * @gl: the glock - * - */ - -static void gfs2_pte_inval(struct gfs2_glock *gl) +void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) { - struct gfs2_inode *ip; - struct inode *inode; + struct gfs2_sbd *sdp = gl->gl_sbd; + unsigned int revokes = atomic_read(&gl->gl_ail_count); + unsigned int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); + int ret; - ip = gl->gl_object; - inode = &ip->i_inode; - if (!ip || !S_ISREG(inode->i_mode)) + if (!revokes) return; - unmap_shared_mapping_range(inode->i_mapping, 0, 0); - if (test_bit(GIF_SW_PAGED, &ip->i_flags)) - set_bit(GLF_DIRTY, &gl->gl_flags); + while (revokes > max_revokes) + max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); + ret = gfs2_trans_begin(sdp, 0, max_revokes); + if (ret) + return; + __gfs2_ail_flush(gl, fsync, max_revokes); + gfs2_trans_end(sdp); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); } /** - * meta_go_sync - sync out the metadata for this glock + * rgrp_go_sync - sync out the metadata for this glock * @gl: the glock * * Called when demoting or unlocking an EX glock. We must flush @@ -101,32 +138,53 @@ static void gfs2_pte_inval(struct gfs2_glock *gl) * not return to caller to demote/unlock the glock until I/O is complete. */ -static void meta_go_sync(struct gfs2_glock *gl) +static void rgrp_go_sync(struct gfs2_glock *gl) { - if (gl->gl_state != LM_ST_EXCLUSIVE) - return; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = &sdp->sd_aspace; + struct gfs2_rgrpd *rgd; + int error; - if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) { - gfs2_log_flush(gl->gl_sbd, gl); - gfs2_meta_sync(gl); - gfs2_ail_empty_gl(gl); - } + if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) + return; + GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); + + gfs2_log_flush(sdp, gl, NORMAL_FLUSH); + filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + mapping_set_error(mapping, error); + gfs2_ail_empty_gl(gl); + + spin_lock(&gl->gl_spin); + rgd = gl->gl_object; + if (rgd) + gfs2_free_clones(rgd); + spin_unlock(&gl->gl_spin); } /** - * meta_go_inval - invalidate the metadata for this glock + * rgrp_go_inval - invalidate the metadata for this glock * @gl: the glock * @flags: * + * We never used LM_ST_DEFERRED with resource groups, so that we + * should always see the metadata flag set here. + * */ -static void meta_go_inval(struct gfs2_glock *gl, int flags) +static void rgrp_go_inval(struct gfs2_glock *gl, int flags) { - if (!(flags & DIO_METADATA)) - return; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = &sdp->sd_aspace; + + WARN_ON_ONCE(!(flags & DIO_METADATA)); + gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); + truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end); - gfs2_meta_inval(gl); - gl->gl_vn++; + if (gl->gl_object) { + struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; + rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + } } /** @@ -138,51 +196,38 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags) static void inode_go_sync(struct gfs2_glock *gl) { struct gfs2_inode *ip = gl->gl_object; - struct address_space *metamapping = gl->gl_aspace->i_mapping; + struct address_space *metamapping = gfs2_glock2aspace(gl); int error; - if (gl->gl_state != LM_ST_UNLOCKED) - gfs2_pte_inval(gl); - if (gl->gl_state != LM_ST_EXCLUSIVE) - return; - if (ip && !S_ISREG(ip->i_inode.i_mode)) ip = NULL; - - if (test_bit(GLF_DIRTY, &gl->gl_flags)) { - gfs2_log_flush(gl->gl_sbd, gl); - filemap_fdatawrite(metamapping); - if (ip) { - struct address_space *mapping = ip->i_inode.i_mapping; - filemap_fdatawrite(mapping); - error = filemap_fdatawait(mapping); - mapping_set_error(mapping, error); - } - error = filemap_fdatawait(metamapping); - mapping_set_error(metamapping, error); - clear_bit(GLF_DIRTY, &gl->gl_flags); - gfs2_ail_empty_gl(gl); + if (ip) { + if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); + inode_dio_wait(&ip->i_inode); } -} - -/** - * inode_go_xmote_bh - After promoting/demoting a glock - * @gl: the glock - * - */ + if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) + return; -static void inode_go_xmote_bh(struct gfs2_glock *gl) -{ - struct gfs2_holder *gh = gl->gl_req_gh; - struct buffer_head *bh; - int error; + GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); - if (gl->gl_state != LM_ST_UNLOCKED && - (!gh || !(gh->gh_flags & GL_SKIP))) { - error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh); - if (!error) - brelse(bh); + gfs2_log_flush(gl->gl_sbd, gl, NORMAL_FLUSH); + filemap_fdatawrite(metamapping); + if (ip) { + struct address_space *mapping = ip->i_inode.i_mapping; + filemap_fdatawrite(mapping); + error = filemap_fdatawait(mapping); + mapping_set_error(mapping, error); } + error = filemap_fdatawait(metamapping); + mapping_set_error(metamapping, error); + gfs2_ail_empty_gl(gl); + /* + * Writeback of the data mapping may cause the dirty flag to be set + * so we have to clear it again here. + */ + smp_mb__before_atomic(); + clear_bit(GLF_DIRTY, &gl->gl_flags); } /** @@ -190,19 +235,32 @@ static void inode_go_xmote_bh(struct gfs2_glock *gl) * @gl: the glock * @flags: * + * Normally we invalidate everything, but if we are moving into + * LM_ST_DEFERRED from LM_ST_SHARED or LM_ST_EXCLUSIVE then we + * can keep hold of the metadata, since it won't have changed. + * */ static void inode_go_inval(struct gfs2_glock *gl, int flags) { struct gfs2_inode *ip = gl->gl_object; - int meta = (flags & DIO_METADATA); - if (meta) { - gfs2_meta_inval(gl); - if (ip) + gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); + + if (flags & DIO_METADATA) { + struct address_space *mapping = gfs2_glock2aspace(gl); + truncate_inode_pages(mapping, 0); + if (ip) { set_bit(GIF_INVALID, &ip->i_flags); + forget_all_cached_acls(&ip->i_inode); + gfs2_dir_hash_inval(ip); + } } + if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) { + gfs2_log_flush(gl->gl_sbd, NULL, NORMAL_FLUSH); + gl->gl_sbd->sd_rindex_uptodate = 0; + } if (ip && S_ISREG(ip->i_inode.i_mode)) truncate_inode_pages(ip->i_inode.i_mapping, 0); } @@ -214,19 +272,130 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) * Returns: 1 if it's ok */ -static int inode_go_demote_ok(struct gfs2_glock *gl) +static int inode_go_demote_ok(const struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; - int demote = 0; + struct gfs2_holder *gh; + + if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) + return 0; - if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages) - demote = 1; - else if (!sdp->sd_args.ar_localcaching && - time_after_eq(jiffies, gl->gl_stamp + - gfs2_tune_get(sdp, gt_demote_secs) * HZ)) - demote = 1; + if (!list_empty(&gl->gl_holders)) { + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (gh->gh_list.next != &gl->gl_holders) + return 0; + } - return demote; + return 1; +} + +/** + * gfs2_set_nlink - Set the inode's link count based on on-disk info + * @inode: The inode in question + * @nlink: The link count + * + * If the link count has hit zero, it must never be raised, whatever the + * on-disk inode might say. When new struct inodes are created the link + * count is set to 1, so that we can safely use this test even when reading + * in on disk information for the first time. + */ + +static void gfs2_set_nlink(struct inode *inode, u32 nlink) +{ + /* + * We will need to review setting the nlink count here in the + * light of the forthcoming ro bind mount work. This is a reminder + * to do that. + */ + if ((inode->i_nlink != nlink) && (inode->i_nlink != 0)) { + if (nlink == 0) + clear_nlink(inode); + else + set_nlink(inode, nlink); + } +} + +static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) +{ + const struct gfs2_dinode *str = buf; + struct timespec atime; + u16 height, depth; + + if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) + goto corrupt; + ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino); + ip->i_inode.i_mode = be32_to_cpu(str->di_mode); + ip->i_inode.i_rdev = 0; + switch (ip->i_inode.i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major), + be32_to_cpu(str->di_minor)); + break; + }; + + i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid)); + i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid)); + gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); + i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); + gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); + atime.tv_sec = be64_to_cpu(str->di_atime); + atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); + if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0) + ip->i_inode.i_atime = atime; + ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); + ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); + ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); + ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); + + ip->i_goal = be64_to_cpu(str->di_goal_meta); + ip->i_generation = be64_to_cpu(str->di_generation); + + ip->i_diskflags = be32_to_cpu(str->di_flags); + ip->i_eattr = be64_to_cpu(str->di_eattr); + /* i_diskflags and i_eattr must be set before gfs2_set_inode_flags() */ + gfs2_set_inode_flags(&ip->i_inode); + height = be16_to_cpu(str->di_height); + if (unlikely(height > GFS2_MAX_META_HEIGHT)) + goto corrupt; + ip->i_height = (u8)height; + + depth = be16_to_cpu(str->di_depth); + if (unlikely(depth > GFS2_DIR_MAX_DEPTH)) + goto corrupt; + ip->i_depth = (u8)depth; + ip->i_entries = be32_to_cpu(str->di_entries); + + if (S_ISREG(ip->i_inode.i_mode)) + gfs2_set_aops(&ip->i_inode); + + return 0; +corrupt: + gfs2_consist_inode(ip); + return -EIO; +} + +/** + * gfs2_inode_refresh - Refresh the incore copy of the dinode + * @ip: The GFS2 inode + * + * Returns: errno + */ + +int gfs2_inode_refresh(struct gfs2_inode *ip) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + error = gfs2_dinode_in(ip, dibh->b_data); + brelse(dibh); + clear_bit(GIF_INVALID, &ip->i_flags); + + return error; } /** @@ -240,10 +409,11 @@ static int inode_go_demote_ok(struct gfs2_glock *gl) static int inode_go_lock(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_inode *ip = gl->gl_object; int error = 0; - if (!ip) + if (!ip || (gh->gh_flags & GL_SKIP)) return 0; if (test_bit(GIF_INVALID, &ip->i_flags)) { @@ -252,79 +422,77 @@ static int inode_go_lock(struct gfs2_holder *gh) return error; } - if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) && + if (gh->gh_state != LM_ST_DEFERRED) + inode_dio_wait(&ip->i_inode); + + if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && (gl->gl_state == LM_ST_EXCLUSIVE) && - (gh->gh_state == LM_ST_EXCLUSIVE)) - error = gfs2_truncatei_resume(ip); + (gh->gh_state == LM_ST_EXCLUSIVE)) { + spin_lock(&sdp->sd_trunc_lock); + if (list_empty(&ip->i_trunc_list)) + list_add(&sdp->sd_trunc_list, &ip->i_trunc_list); + spin_unlock(&sdp->sd_trunc_lock); + wake_up(&sdp->sd_quota_wait); + return 1; + } return error; } /** - * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock - * @gl: the glock + * inode_go_dump - print information about an inode + * @seq: The iterator + * @ip: the inode * - * Returns: 1 if it's ok */ -static int rgrp_go_demote_ok(struct gfs2_glock *gl) +static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) { - return !gl->gl_aspace->i_mapping->nrpages; -} - -/** - * rgrp_go_lock - operation done after an rgrp lock is locked by - * a first holder on this node. - * @gl: the glock - * @flags: - * - * Returns: errno - */ - -static int rgrp_go_lock(struct gfs2_holder *gh) -{ - return gfs2_rgrp_bh_get(gh->gh_gl->gl_object); -} - -/** - * rgrp_go_unlock - operation done before an rgrp lock is unlocked by - * a last holder on this node. - * @gl: the glock - * @flags: - * - */ - -static void rgrp_go_unlock(struct gfs2_holder *gh) -{ - gfs2_rgrp_bh_put(gh->gh_gl->gl_object); + const struct gfs2_inode *ip = gl->gl_object; + if (ip == NULL) + return; + gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", + (unsigned long long)ip->i_no_formal_ino, + (unsigned long long)ip->i_no_addr, + IF2DT(ip->i_inode.i_mode), ip->i_flags, + (unsigned int)ip->i_diskflags, + (unsigned long long)i_size_read(&ip->i_inode)); } /** - * trans_go_sync - promote/demote the transaction glock + * freeze_go_sync - promote/demote the freeze glock * @gl: the glock * @state: the requested state * @flags: * */ -static void trans_go_sync(struct gfs2_glock *gl) +static void freeze_go_sync(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; + DEFINE_WAIT(wait); - if (gl->gl_state != LM_ST_UNLOCKED && + if (gl->gl_state == LM_ST_SHARED && test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { - gfs2_meta_syncfs(sdp); - gfs2_log_shutdown(sdp); + atomic_set(&sdp->sd_log_freeze, 1); + wake_up(&sdp->sd_logd_waitq); + do { + prepare_to_wait(&sdp->sd_log_frozen_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (atomic_read(&sdp->sd_log_freeze)) + io_schedule(); + } while(atomic_read(&sdp->sd_log_freeze)); + finish_wait(&sdp->sd_log_frozen_wait, &wait); } } /** - * trans_go_xmote_bh - After promoting/demoting the transaction glock + * freeze_go_xmote_bh - After promoting/demoting the freeze glock * @gl: the glock * */ -static void trans_go_xmote_bh(struct gfs2_glock *gl) +static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) { struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); @@ -332,8 +500,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl) struct gfs2_log_header_host head; int error; - if (gl->gl_state != LM_ST_UNLOCKED && - test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); error = gfs2_find_jhead(sdp->sd_jdesc, &head); @@ -348,53 +515,77 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl) gfs2_log_pointers_init(sdp, head.lh_blkno); } } + return 0; } /** - * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock + * trans_go_demote_ok * @gl: the glock * - * Returns: 1 if it's ok + * Always returns 0 */ -static int quota_go_demote_ok(struct gfs2_glock *gl) +static int freeze_go_demote_ok(const struct gfs2_glock *gl) +{ + return 0; +} + +/** + * iopen_go_callback - schedule the dcache entry for the inode to be deleted + * @gl: the glock + * + * gl_spin lock is held while calling this + */ +static void iopen_go_callback(struct gfs2_glock *gl, bool remote) { - return !atomic_read(&gl->gl_lvb_count); + struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; + struct gfs2_sbd *sdp = gl->gl_sbd; + + if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY)) + return; + + if (gl->gl_demote_state == LM_ST_UNLOCKED && + gl->gl_state == LM_ST_SHARED && ip) { + gl->gl_lockref.count++; + if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gl->gl_lockref.count--; + } } const struct gfs2_glock_operations gfs2_meta_glops = { - .go_xmote_th = meta_go_sync, .go_type = LM_TYPE_META, }; const struct gfs2_glock_operations gfs2_inode_glops = { - .go_xmote_th = inode_go_sync, - .go_xmote_bh = inode_go_xmote_bh, + .go_sync = inode_go_sync, .go_inval = inode_go_inval, .go_demote_ok = inode_go_demote_ok, .go_lock = inode_go_lock, + .go_dump = inode_go_dump, .go_type = LM_TYPE_INODE, - .go_min_hold_time = HZ / 10, + .go_flags = GLOF_ASPACE, }; const struct gfs2_glock_operations gfs2_rgrp_glops = { - .go_xmote_th = meta_go_sync, - .go_inval = meta_go_inval, - .go_demote_ok = rgrp_go_demote_ok, - .go_lock = rgrp_go_lock, - .go_unlock = rgrp_go_unlock, + .go_sync = rgrp_go_sync, + .go_inval = rgrp_go_inval, + .go_lock = gfs2_rgrp_go_lock, + .go_unlock = gfs2_rgrp_go_unlock, + .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, - .go_min_hold_time = HZ / 10, + .go_flags = GLOF_LVB, }; -const struct gfs2_glock_operations gfs2_trans_glops = { - .go_xmote_th = trans_go_sync, - .go_xmote_bh = trans_go_xmote_bh, +const struct gfs2_glock_operations gfs2_freeze_glops = { + .go_sync = freeze_go_sync, + .go_xmote_bh = freeze_go_xmote_bh, + .go_demote_ok = freeze_go_demote_ok, .go_type = LM_TYPE_NONDISK, }; const struct gfs2_glock_operations gfs2_iopen_glops = { .go_type = LM_TYPE_IOPEN, + .go_callback = iopen_go_callback, }; const struct gfs2_glock_operations gfs2_flock_glops = { @@ -406,11 +597,22 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = { }; const struct gfs2_glock_operations gfs2_quota_glops = { - .go_demote_ok = quota_go_demote_ok, .go_type = LM_TYPE_QUOTA, + .go_flags = GLOF_LVB, }; const struct gfs2_glock_operations gfs2_journal_glops = { .go_type = LM_TYPE_JOURNAL, }; +const struct gfs2_glock_operations *gfs2_glops_list[] = { + [LM_TYPE_META] = &gfs2_meta_glops, + [LM_TYPE_INODE] = &gfs2_inode_glops, + [LM_TYPE_RGRP] = &gfs2_rgrp_glops, + [LM_TYPE_IOPEN] = &gfs2_iopen_glops, + [LM_TYPE_FLOCK] = &gfs2_flock_glops, + [LM_TYPE_NONDISK] = &gfs2_nondisk_glops, + [LM_TYPE_QUOTA] = &gfs2_quota_glops, + [LM_TYPE_JOURNAL] = &gfs2_journal_glops, +}; + diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h index a1d9b5b024e..7455d2629bc 100644 --- a/fs/gfs2/glops.h +++ b/fs/gfs2/glops.h @@ -15,11 +15,14 @@ extern const struct gfs2_glock_operations gfs2_meta_glops; extern const struct gfs2_glock_operations gfs2_inode_glops; extern const struct gfs2_glock_operations gfs2_rgrp_glops; -extern const struct gfs2_glock_operations gfs2_trans_glops; +extern const struct gfs2_glock_operations gfs2_freeze_glops; extern const struct gfs2_glock_operations gfs2_iopen_glops; extern const struct gfs2_glock_operations gfs2_flock_glops; extern const struct gfs2_glock_operations gfs2_nondisk_glops; extern const struct gfs2_glock_operations gfs2_quota_glops; extern const struct gfs2_glock_operations gfs2_journal_glops; +extern const struct gfs2_glock_operations *gfs2_glops_list[]; + +extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync); #endif /* __GLOPS_DOT_H__ */ diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 525dcae352d..67d310c9ada 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -11,21 +11,30 @@ #define __INCORE_DOT_H__ #include <linux/fs.h> +#include <linux/kobject.h> #include <linux/workqueue.h> +#include <linux/dlm.h> +#include <linux/buffer_head.h> +#include <linux/rcupdate.h> +#include <linux/rculist_bl.h> +#include <linux/completion.h> +#include <linux/rbtree.h> +#include <linux/ktime.h> +#include <linux/percpu.h> +#include <linux/lockref.h> #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 -#define DIO_ALL 0x00000100 struct gfs2_log_operations; -struct gfs2_log_element; +struct gfs2_bufdata; struct gfs2_holder; struct gfs2_glock; struct gfs2_quota_data; struct gfs2_trans; -struct gfs2_ail; struct gfs2_jdesc; struct gfs2_sbd; +struct lm_lockops; typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret); @@ -43,10 +52,8 @@ struct gfs2_log_header_host { */ struct gfs2_log_operations { - void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le); - void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); - void (*lo_before_commit) (struct gfs2_sbd *sdp); - void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); + void (*lo_before_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); + void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); void (*lo_before_scan) (struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, int pass); int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start, @@ -56,94 +63,189 @@ struct gfs2_log_operations { const char *lo_name; }; -struct gfs2_log_element { - struct list_head le_list; - const struct gfs2_log_operations *le_ops; -}; +#define GBF_FULL 1 struct gfs2_bitmap { struct buffer_head *bi_bh; char *bi_clone; + unsigned long bi_flags; u32 bi_offset; u32 bi_start; u32 bi_len; -}; - -struct gfs2_rgrp_host { - u32 rg_flags; - u32 rg_free; - u32 rg_dinodes; - u64 rg_igeneration; + u32 bi_blocks; }; struct gfs2_rgrpd { - struct list_head rd_list; /* Link with superblock */ - struct list_head rd_list_mru; - struct list_head rd_recent; /* Recently used rgrps */ + struct rb_node rd_node; /* Link with superblock */ struct gfs2_glock *rd_gl; /* Glock for this rgrp */ u64 rd_addr; /* grp block disk address */ u64 rd_data0; /* first data location */ u32 rd_length; /* length of rgrp header in fs blocks */ u32 rd_data; /* num of data blocks in rgrp */ u32 rd_bitbytes; /* number of bytes in data bitmaps */ - struct gfs2_rgrp_host rd_rg; - u64 rd_rg_vn; - struct gfs2_bitmap *rd_bits; - unsigned int rd_bh_count; - struct mutex rd_mutex; + u32 rd_free; + u32 rd_reserved; /* number of blocks reserved */ u32 rd_free_clone; - struct gfs2_log_element rd_le; - u32 rd_last_alloc_data; - u32 rd_last_alloc_meta; + u32 rd_dinodes; + u64 rd_igeneration; + struct gfs2_bitmap *rd_bits; struct gfs2_sbd *rd_sbd; - unsigned long rd_flags; -#define GFS2_RDF_CHECK 0x0001 /* Need to check for unlinked inodes */ + struct gfs2_rgrp_lvb *rd_rgl; + u32 rd_last_alloc; + u32 rd_flags; + u32 rd_extfail_pt; /* extent failure point */ +#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ +#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ +#define GFS2_RDF_ERROR 0x40000000 /* error in rg */ +#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ + spinlock_t rd_rsspin; /* protects reservation related vars */ + struct rb_root rd_rstree; /* multi-block reservation tree */ +}; + +struct gfs2_rbm { + struct gfs2_rgrpd *rgd; + u32 offset; /* The offset is bitmap relative */ + int bii; /* Bitmap index */ }; +static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm) +{ + return rbm->rgd->rd_bits + rbm->bii; +} + +static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm) +{ + return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) + + rbm->offset; +} + +static inline bool gfs2_rbm_eq(const struct gfs2_rbm *rbm1, + const struct gfs2_rbm *rbm2) +{ + return (rbm1->rgd == rbm2->rgd) && (rbm1->bii == rbm2->bii) && + (rbm1->offset == rbm2->offset); +} + enum gfs2_state_bits { BH_Pinned = BH_PrivateStart, BH_Escaped = BH_PrivateStart + 1, + BH_Zeronew = BH_PrivateStart + 2, }; BUFFER_FNS(Pinned, pinned) TAS_BUFFER_FNS(Pinned, pinned) BUFFER_FNS(Escaped, escaped) TAS_BUFFER_FNS(Escaped, escaped) +BUFFER_FNS(Zeronew, zeronew) +TAS_BUFFER_FNS(Zeronew, zeronew) struct gfs2_bufdata { struct buffer_head *bd_bh; struct gfs2_glock *bd_gl; + u64 bd_blkno; - union { - struct list_head list_tr; - u64 blkno; - } u; -#define bd_list_tr u.list_tr -#define bd_blkno u.blkno - - struct gfs2_log_element bd_le; + struct list_head bd_list; + const struct gfs2_log_operations *bd_ops; - struct gfs2_ail *bd_ail; + struct gfs2_trans *bd_tr; struct list_head bd_ail_st_list; struct list_head bd_ail_gl_list; }; +/* + * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a + * prefix of lock_dlm_ gets awkward. + */ + +#define GDLM_STRNAME_BYTES 25 +#define GDLM_LVB_SIZE 32 + +/* + * ls_recover_flags: + * + * DFL_BLOCK_LOCKS: dlm is in recovery and will grant locks that had been + * held by failed nodes whose journals need recovery. Those locks should + * only be used for journal recovery until the journal recovery is done. + * This is set by the dlm recover_prep callback and cleared by the + * gfs2_control thread when journal recovery is complete. To avoid + * races between recover_prep setting and gfs2_control clearing, recover_spin + * is held while changing this bit and reading/writing recover_block + * and recover_start. + * + * DFL_NO_DLM_OPS: dlm lockspace ops/callbacks are not being used. + * + * DFL_FIRST_MOUNT: this node is the first to mount this fs and is doing + * recovery of all journals before allowing other nodes to mount the fs. + * This is cleared when FIRST_MOUNT_DONE is set. + * + * DFL_FIRST_MOUNT_DONE: this node was the first mounter, and has finished + * recovery of all journals, and now allows other nodes to mount the fs. + * + * DFL_MOUNT_DONE: gdlm_mount has completed successfully and cleared + * BLOCK_LOCKS for the first time. The gfs2_control thread should now + * control clearing BLOCK_LOCKS for further recoveries. + * + * DFL_UNMOUNT: gdlm_unmount sets to keep sdp off gfs2_control_wq. + * + * DFL_DLM_RECOVERY: set while dlm is in recovery, between recover_prep() + * and recover_done(), i.e. set while recover_block == recover_start. + */ + +enum { + DFL_BLOCK_LOCKS = 0, + DFL_NO_DLM_OPS = 1, + DFL_FIRST_MOUNT = 2, + DFL_FIRST_MOUNT_DONE = 3, + DFL_MOUNT_DONE = 4, + DFL_UNMOUNT = 5, + DFL_DLM_RECOVERY = 6, +}; + +struct lm_lockname { + u64 ln_number; + unsigned int ln_type; +}; + +#define lm_name_equal(name1, name2) \ + (((name1)->ln_number == (name2)->ln_number) && \ + ((name1)->ln_type == (name2)->ln_type)) + + struct gfs2_glock_operations { - void (*go_xmote_th) (struct gfs2_glock *gl); - void (*go_xmote_bh) (struct gfs2_glock *gl); + void (*go_sync) (struct gfs2_glock *gl); + int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); void (*go_inval) (struct gfs2_glock *gl, int flags); - int (*go_demote_ok) (struct gfs2_glock *gl); + int (*go_demote_ok) (const struct gfs2_glock *gl); int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); + void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); + void (*go_callback)(struct gfs2_glock *gl, bool remote); const int go_type; - const unsigned long go_min_hold_time; + const unsigned long go_flags; +#define GLOF_ASPACE 1 +#define GLOF_LVB 2 +}; + +enum { + GFS2_LKS_SRTT = 0, /* Non blocking smoothed round trip time */ + GFS2_LKS_SRTTVAR = 1, /* Non blocking smoothed variance */ + GFS2_LKS_SRTTB = 2, /* Blocking smoothed round trip time */ + GFS2_LKS_SRTTVARB = 3, /* Blocking smoothed variance */ + GFS2_LKS_SIRT = 4, /* Smoothed Inter-request time */ + GFS2_LKS_SIRTVAR = 5, /* Smoothed Inter-request variance */ + GFS2_LKS_DCOUNT = 6, /* Count of dlm requests */ + GFS2_LKS_QCOUNT = 7, /* Count of gfs2_holder queues */ + GFS2_NR_LKSTATS +}; + +struct gfs2_lkstats { + s64 stats[GFS2_NR_LKSTATS]; }; enum { /* States */ - HIF_HOLDER = 6, + HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ HIF_FIRST = 7, - HIF_ABORTED = 9, HIF_WAIT = 10, }; @@ -156,118 +258,143 @@ struct gfs2_holder { unsigned gh_flags; int gh_error; - unsigned long gh_iflags; + unsigned long gh_iflags; /* HIF_... */ unsigned long gh_ip; }; +/* Resource group multi-block reservation, in order of appearance: + + Step 1. Function prepares to write, allocates a mb, sets the size hint. + Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info + Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use + Step 4. Bits are assigned from the rgrp based on either the reservation + or wherever it can. +*/ + +struct gfs2_blkreserv { + /* components used during write (step 1): */ + atomic_t rs_sizehint; /* hint of the write size */ + + struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */ + struct rb_node rs_node; /* link to other block reservations */ + struct gfs2_rbm rs_rbm; /* Start of reservation */ + u32 rs_free; /* how many blocks are still free */ + u64 rs_inum; /* Inode number for reservation */ + + /* ancillary quota stuff */ + struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS]; + struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS]; + unsigned int rs_qa_qd_num; +}; + +/* + * Allocation parameters + * @target: The number of blocks we'd ideally like to allocate + * @aflags: The flags (e.g. Orlov flag) + * + * The intent is to gradually expand this structure over time in + * order to give more information, e.g. alignment, min extent size + * to the allocation code. + */ +struct gfs2_alloc_parms { + u32 target; + u32 aflags; +}; + enum { - GLF_LOCK = 1, - GLF_STICKY = 2, - GLF_DEMOTE = 3, - GLF_PENDING_DEMOTE = 4, - GLF_DIRTY = 5, - GLF_DEMOTE_IN_PROGRESS = 6, - GLF_LFLUSH = 7, + GLF_LOCK = 1, + GLF_DEMOTE = 3, + GLF_PENDING_DEMOTE = 4, + GLF_DEMOTE_IN_PROGRESS = 5, + GLF_DIRTY = 6, + GLF_LFLUSH = 7, + GLF_INVALIDATE_IN_PROGRESS = 8, + GLF_REPLY_PENDING = 9, + GLF_INITIAL = 10, + GLF_FROZEN = 11, + GLF_QUEUED = 12, + GLF_LRU = 13, + GLF_OBJECT = 14, /* Used only for tracing */ + GLF_BLOCKING = 15, }; struct gfs2_glock { - struct hlist_node gl_list; + struct hlist_bl_node gl_list; + struct gfs2_sbd *gl_sbd; unsigned long gl_flags; /* GLF_... */ struct lm_lockname gl_name; - atomic_t gl_ref; - spinlock_t gl_spin; + struct lockref gl_lockref; +#define gl_spin gl_lockref.lock + + /* State fields protected by gl_spin */ + unsigned int gl_state:2, /* Current state */ + gl_target:2, /* Target state */ + gl_demote_state:2, /* State requested by remote node */ + gl_req:2, /* State in last dlm request */ + gl_reply:8; /* Last reply from the dlm */ - unsigned int gl_state; unsigned int gl_hash; - unsigned int gl_demote_state; /* state requested by remote node */ unsigned long gl_demote_time; /* time of first demote request */ - struct pid *gl_owner_pid; - unsigned long gl_ip; + long gl_hold_time; struct list_head gl_holders; - struct list_head gl_waiters1; /* HIF_MUTEX */ - struct list_head gl_waiters3; /* HIF_PROMOTE */ - int gl_waiters2; /* GIF_DEMOTE */ const struct gfs2_glock_operations *gl_ops; - - struct gfs2_holder *gl_req_gh; - gfs2_glop_bh_t gl_req_bh; - - void *gl_lock; - char *gl_lvb; - atomic_t gl_lvb_count; - - u64 gl_vn; - unsigned long gl_stamp; + ktime_t gl_dstamp; + struct gfs2_lkstats gl_stats; + struct dlm_lksb gl_lksb; unsigned long gl_tchange; void *gl_object; - struct list_head gl_reclaim; - - struct gfs2_sbd *gl_sbd; - - struct inode *gl_aspace; + struct list_head gl_lru; struct list_head gl_ail_list; atomic_t gl_ail_count; + atomic_t gl_revokes; struct delayed_work gl_work; + union { + /* For inode and iopen glocks only */ + struct work_struct gl_delete; + /* For rgrp glocks only */ + struct { + loff_t start; + loff_t end; + } gl_vm; + }; + struct rcu_head gl_rcu; }; -struct gfs2_alloc { - /* Quota stuff */ - - struct gfs2_quota_data *al_qd[2*MAXQUOTAS]; - struct gfs2_holder al_qd_ghs[2*MAXQUOTAS]; - unsigned int al_qd_num; - - u32 al_requested; /* Filled in by caller of gfs2_inplace_reserve() */ - u32 al_alloced; /* Filled in by gfs2_alloc_*() */ - - /* Filled in by gfs2_inplace_reserve() */ - - unsigned int al_line; - char *al_file; - struct gfs2_holder al_ri_gh; - struct gfs2_holder al_rgd_gh; - struct gfs2_rgrpd *al_rgd; - -}; +#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ enum { GIF_INVALID = 0, GIF_QD_LOCKED = 1, + GIF_ALLOC_FAILED = 2, GIF_SW_PAGED = 3, -}; - -struct gfs2_dinode_host { - u64 di_size; /* number of bytes in file */ - u64 di_blocks; /* number of blocks in file */ - u64 di_goal_meta; /* rgrp to alloc from next */ - u64 di_goal_data; /* data block goal */ - u64 di_generation; /* generation number for NFS */ - u32 di_flags; /* GFS2_DIF_... */ - u16 di_height; /* height of metadata */ - /* These only apply to directories */ - u16 di_depth; /* Number of bits in the table */ - u32 di_entries; /* The number of entries in the directory */ - u64 di_eattr; /* extended attribute block number */ + GIF_ORDERED = 4, + GIF_FREE_VFS_INODE = 5, }; struct gfs2_inode { struct inode i_inode; u64 i_no_addr; u64 i_no_formal_ino; + u64 i_generation; + u64 i_eattr; unsigned long i_flags; /* GIF_... */ - - struct gfs2_dinode_host i_di; /* To be replaced by ref to block */ - struct gfs2_glock *i_gl; /* Move into i_gh? */ struct gfs2_holder i_iopen_gh; struct gfs2_holder i_gh; /* for prepare/commit_write only */ - struct gfs2_alloc *i_alloc; - u64 i_last_rg_alloc; - + struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */ + struct gfs2_rgrpd *i_rgd; + u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; + struct list_head i_ordered; + struct list_head i_trunc_list; + __be64 *i_hash_cache; + u32 i_entries; + u32 i_diskflags; + u8 i_height; + u8 i_depth; }; /* @@ -296,16 +423,20 @@ struct gfs2_revoke_replay { }; enum { - QDF_USER = 0, QDF_CHANGE = 1, QDF_LOCKED = 2, + QDF_REFRESH = 3, }; struct gfs2_quota_data { + struct hlist_bl_node qd_hlist; struct list_head qd_list; - unsigned int qd_count; + struct kqid qd_id; + struct gfs2_sbd *qd_sbd; + struct lockref qd_lockref; + struct list_head qd_lru; + unsigned qd_hash; - u32 qd_id; unsigned long qd_flags; /* QDF_... */ s64 qd_change; @@ -323,7 +454,7 @@ struct gfs2_quota_data { u64 qd_sync_gen; unsigned long qd_last_warn; - unsigned long qd_last_touched; + struct rcu_head qd_rcu; }; struct gfs2_trans { @@ -332,34 +463,28 @@ struct gfs2_trans { unsigned int tr_blocks; unsigned int tr_revokes; unsigned int tr_reserved; + unsigned int tr_touched:1; + unsigned int tr_attached:1; + unsigned int tr_alloced:1; - struct gfs2_holder tr_t_gh; - - int tr_touched; - - unsigned int tr_num_buf; unsigned int tr_num_buf_new; unsigned int tr_num_databuf_new; unsigned int tr_num_buf_rm; unsigned int tr_num_databuf_rm; - struct list_head tr_list_buf; - unsigned int tr_num_revoke; unsigned int tr_num_revoke_rm; -}; - -struct gfs2_ail { - struct list_head ai_list; - unsigned int ai_first; - struct list_head ai_ail1_list; - struct list_head ai_ail2_list; + struct list_head tr_list; + struct list_head tr_databuf; + struct list_head tr_buf; - u64 ai_sync_gen; + unsigned int tr_first; + struct list_head tr_ail1_list; + struct list_head tr_ail2_list; }; struct gfs2_journal_extent { - struct list_head extent_list; + struct list_head list; unsigned int lblock; /* First logical block */ u64 dblock; /* First disk block */ @@ -369,12 +494,23 @@ struct gfs2_journal_extent { struct gfs2_jdesc { struct list_head jd_list; struct list_head extent_list; - + unsigned int nr_extents; + struct work_struct jd_work; struct inode *jd_inode; + unsigned long jd_flags; +#define JDF_RECOVERY 1 unsigned int jd_jid; - int jd_dirty; - unsigned int jd_blocks; + int jd_recover_error; + /* Replay stuff */ + + unsigned int jd_found_blocks; + unsigned int jd_found_revokes; + unsigned int jd_replayed_blocks; + + struct list_head jd_revoke_list; + unsigned int jd_replay_tail; + }; struct gfs2_statfs_change_host { @@ -383,9 +519,6 @@ struct gfs2_statfs_change_host { s64 sc_dinodes; }; -#define GFS2_GLOCKD_DEFAULT 1 -#define GFS2_GLOCKD_MAX 16 - #define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF #define GFS2_QUOTA_OFF 0 #define GFS2_QUOTA_ACCOUNT 1 @@ -395,45 +528,45 @@ struct gfs2_statfs_change_host { #define GFS2_DATA_WRITEBACK 1 #define GFS2_DATA_ORDERED 2 +#define GFS2_ERRORS_DEFAULT GFS2_ERRORS_WITHDRAW +#define GFS2_ERRORS_WITHDRAW 0 +#define GFS2_ERRORS_CONTINUE 1 /* place holder for future feature */ +#define GFS2_ERRORS_RO 2 /* place holder for future feature */ +#define GFS2_ERRORS_PANIC 3 + struct gfs2_args { - char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ - char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ - char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ - int ar_spectator; /* Don't get a journal because we're always RO */ - int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */ - int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */ - int ar_localcaching; /* Local-style caching (dangerous on multihost) */ - int ar_debug; /* Oops on errors instead of trying to be graceful */ - int ar_upgrade; /* Upgrade ondisk/multihost format */ - unsigned int ar_num_glockd; /* Number of glockd threads */ - int ar_posix_acl; /* Enable posix acls */ - int ar_quota; /* off/account/on */ - int ar_suiddir; /* suiddir support */ - int ar_data; /* ordered/writeback */ + char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ + char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ + char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ + unsigned int ar_spectator:1; /* Don't get a journal */ + unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ + unsigned int ar_debug:1; /* Oops on errors */ + unsigned int ar_posix_acl:1; /* Enable posix acls */ + unsigned int ar_quota:2; /* off/account/on */ + unsigned int ar_suiddir:1; /* suiddir support */ + unsigned int ar_data:2; /* ordered/writeback */ + unsigned int ar_meta:1; /* mount metafs */ + unsigned int ar_discard:1; /* discard requests */ + unsigned int ar_errors:2; /* errors=withdraw | panic */ + unsigned int ar_nobarrier:1; /* do not send barriers */ + unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */ + int ar_commit; /* Commit interval */ + int ar_statfs_quantum; /* The fast statfs interval */ + int ar_quota_quantum; /* The quota interval */ + int ar_statfs_percent; /* The % change to force sync */ }; struct gfs2_tune { spinlock_t gt_spin; - unsigned int gt_demote_secs; /* Cache retention for unheld glock */ - unsigned int gt_incore_log_blocks; - unsigned int gt_log_flush_secs; - - unsigned int gt_recoverd_secs; unsigned int gt_logd_secs; - unsigned int gt_quotad_secs; - unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ unsigned int gt_quota_scale_num; /* Numerator */ unsigned int gt_quota_scale_den; /* Denominator */ - unsigned int gt_quota_cache_secs; unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ - unsigned int gt_atime_quantum; /* Min secs between atime updates */ unsigned int gt_new_files_jdata; - unsigned int gt_new_files_directio; unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ - unsigned int gt_stall_secs; /* Detects trouble! */ unsigned int gt_complain_secs; unsigned int gt_statfs_quantum; unsigned int gt_statfs_slow; @@ -443,7 +576,12 @@ enum { SDF_JOURNAL_CHECKED = 0, SDF_JOURNAL_LIVE = 1, SDF_SHUTDOWN = 2, - SDF_NOATIME = 3, + SDF_NOBARRIERS = 3, + SDF_NORECOVERY = 4, + SDF_DEMOTE = 5, + SDF_NOJOURNALID = 6, + SDF_RORECOVERY = 7, /* read only recovery */ + SDF_SKIP_DLM_UNLOCK = 8, }; #define GFS2_FSNAME_LEN 256 @@ -470,9 +608,48 @@ struct gfs2_sb_host { char sb_locktable[GFS2_LOCKNAME_LEN]; }; +/* + * lm_mount() return values + * + * ls_jid - the journal ID this node should use + * ls_first - this node is the first to mount the file system + * ls_lockspace - lock module's context for this file system + * ls_ops - lock module's functions + */ + +struct lm_lockstruct { + int ls_jid; + unsigned int ls_first; + const struct lm_lockops *ls_ops; + dlm_lockspace_t *ls_dlm; + + int ls_recover_jid_done; /* These two are deprecated, */ + int ls_recover_jid_status; /* used previously by gfs_controld */ + + struct dlm_lksb ls_mounted_lksb; /* mounted_lock */ + struct dlm_lksb ls_control_lksb; /* control_lock */ + char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */ + struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ + char *ls_lvb_bits; + + spinlock_t ls_recover_spin; /* protects following fields */ + unsigned long ls_recover_flags; /* DFL_ */ + uint32_t ls_recover_mount; /* gen in first recover_done cb */ + uint32_t ls_recover_start; /* gen in last recover_done cb */ + uint32_t ls_recover_block; /* copy recover_start in last recover_prep */ + uint32_t ls_recover_size; /* size of recover_submit, recover_result */ + uint32_t *ls_recover_submit; /* gen in last recover_slot cb per jid */ + uint32_t *ls_recover_result; /* result of last jid recovery */ +}; + +struct gfs2_pcpu_lkstats { + /* One struct for each glock type */ + struct gfs2_lkstats lkstats[10]; +}; + struct gfs2_sbd { struct super_block *sd_vfs; - struct super_block *sd_vfs_meta; + struct gfs2_pcpu_lkstats __percpu *sd_lkstats; struct kobject sd_kobj; unsigned long sd_flags; /* SDF_... */ struct gfs2_sb_host sd_sb; @@ -488,11 +665,12 @@ struct gfs2_sbd { u32 sd_hash_bsize_shift; u32 sd_hash_ptrs; /* Number of pointers in a hash block */ u32 sd_qc_per_block; + u32 sd_blocks_per_bitmap; u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ u32 sd_max_height; /* Max height of a file's metadata tree */ - u64 sd_heightsize[GFS2_MAX_META_HEIGHT]; + u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; u32 sd_max_jheight; /* Max height of journaled file's meta tree */ - u64 sd_jheightsize[GFS2_MAX_META_HEIGHT]; + u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1]; struct gfs2_args sd_args; /* Mount arguments */ struct gfs2_tune sd_tune; /* Filesystem tuning structure */ @@ -500,47 +678,41 @@ struct gfs2_sbd { /* Lock Stuff */ struct lm_lockstruct sd_lockstruct; - struct list_head sd_reclaim_list; - spinlock_t sd_reclaim_lock; - wait_queue_head_t sd_reclaim_wq; - atomic_t sd_reclaim_count; struct gfs2_holder sd_live_gh; struct gfs2_glock *sd_rename_gl; - struct gfs2_glock *sd_trans_gl; + struct gfs2_glock *sd_freeze_gl; + wait_queue_head_t sd_glock_wait; + atomic_t sd_glock_disposal; + struct completion sd_locking_init; + struct completion sd_wdack; + struct delayed_work sd_control_work; /* Inode Stuff */ - struct inode *sd_master_dir; + struct dentry *sd_master_dir; + struct dentry *sd_root_dir; + struct inode *sd_jindex; - struct inode *sd_inum_inode; struct inode *sd_statfs_inode; - struct inode *sd_ir_inode; struct inode *sd_sc_inode; struct inode *sd_qc_inode; struct inode *sd_rindex; struct inode *sd_quota_inode; - /* Inum stuff */ - - struct mutex sd_inum_mutex; - /* StatFS stuff */ spinlock_t sd_statfs_spin; struct gfs2_statfs_change_host sd_statfs_master; struct gfs2_statfs_change_host sd_statfs_local; - unsigned long sd_statfs_sync_time; + int sd_statfs_force_sync; /* Resource group stuff */ - u64 sd_rindex_vn; + int sd_rindex_uptodate; spinlock_t sd_rindex_spin; - struct mutex sd_rindex_mutex; - struct list_head sd_rindex_list; - struct list_head sd_rindex_mru_list; - struct list_head sd_rindex_recent_list; - struct gfs2_rgrpd *sd_rindex_forward; + struct rb_root sd_rindex_tree; unsigned int sd_rgrps; + unsigned int sd_max_rg_data; /* Journal index stuff */ @@ -548,99 +720,86 @@ struct gfs2_sbd { spinlock_t sd_jindex_spin; struct mutex sd_jindex_mutex; unsigned int sd_journals; - unsigned long sd_jindex_refresh_time; struct gfs2_jdesc *sd_jdesc; struct gfs2_holder sd_journal_gh; struct gfs2_holder sd_jinode_gh; - struct gfs2_holder sd_ir_gh; struct gfs2_holder sd_sc_gh; struct gfs2_holder sd_qc_gh; + struct completion sd_journal_ready; + /* Daemon stuff */ - struct task_struct *sd_recoverd_process; struct task_struct *sd_logd_process; struct task_struct *sd_quotad_process; - struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX]; - unsigned int sd_glockd_num; /* Quota stuff */ struct list_head sd_quota_list; atomic_t sd_quota_count; - spinlock_t sd_quota_spin; struct mutex sd_quota_mutex; + struct mutex sd_quota_sync_mutex; + wait_queue_head_t sd_quota_wait; + struct list_head sd_trunc_list; + spinlock_t sd_trunc_lock; unsigned int sd_quota_slots; - unsigned int sd_quota_chunks; - unsigned char **sd_quota_bitmap; + unsigned long *sd_quota_bitmap; + spinlock_t sd_bitmap_lock; u64 sd_quota_sync_gen; - unsigned long sd_quota_sync_time; /* Log stuff */ + struct address_space sd_aspace; + spinlock_t sd_log_lock; + struct gfs2_trans *sd_log_tr; unsigned int sd_log_blks_reserved; - unsigned int sd_log_commited_buf; - unsigned int sd_log_commited_databuf; - unsigned int sd_log_commited_revoke; + int sd_log_commited_revoke; - unsigned int sd_log_num_buf; + atomic_t sd_log_pinned; unsigned int sd_log_num_revoke; - unsigned int sd_log_num_rg; - unsigned int sd_log_num_databuf; - struct list_head sd_log_le_buf; struct list_head sd_log_le_revoke; - struct list_head sd_log_le_rg; - struct list_head sd_log_le_databuf; struct list_head sd_log_le_ordered; + spinlock_t sd_ordered_lock; + atomic_t sd_log_thresh1; + atomic_t sd_log_thresh2; atomic_t sd_log_blks_free; - struct mutex sd_log_reserve_mutex; + wait_queue_head_t sd_log_waitq; + wait_queue_head_t sd_logd_waitq; u64 sd_log_sequence; unsigned int sd_log_head; unsigned int sd_log_tail; int sd_log_idle; - unsigned long sd_log_flush_time; struct rw_semaphore sd_log_flush_lock; atomic_t sd_log_in_flight; + struct bio *sd_log_bio; wait_queue_head_t sd_log_flush_wait; + int sd_log_error; unsigned int sd_log_flush_head; u64 sd_log_flush_wrapped; + spinlock_t sd_ail_lock; struct list_head sd_ail1_list; struct list_head sd_ail2_list; - u64 sd_ail_sync_gen; - - /* Replay stuff */ - - struct list_head sd_revoke_list; - unsigned int sd_replay_tail; - - unsigned int sd_found_blocks; - unsigned int sd_found_revokes; - unsigned int sd_replayed_blocks; /* For quiescing the filesystem */ - struct gfs2_holder sd_freeze_gh; - struct mutex sd_freeze_lock; - unsigned int sd_freeze_count; - - /* Counters */ - - atomic_t sd_glock_count; - atomic_t sd_glock_held_count; - atomic_t sd_inode_count; - atomic_t sd_reclaimed; + struct gfs2_holder sd_freeze_root_gh; + struct gfs2_holder sd_thaw_gh; + atomic_t sd_log_freeze; + atomic_t sd_frozen_root; + wait_queue_head_t sd_frozen_root_wait; + wait_queue_head_t sd_log_frozen_wait; char sd_fsname[GFS2_FSNAME_LEN]; char sd_table_name[GFS2_FSNAME_LEN]; @@ -649,10 +808,24 @@ struct gfs2_sbd { /* Debugging crud */ unsigned long sd_last_warning; - struct vfsmount *sd_gfs2mnt; struct dentry *debugfs_dir; /* debugfs directory */ - struct dentry *debugfs_dentry_glocks; /* for debugfs */ + struct dentry *debugfs_dentry_glocks; + struct dentry *debugfs_dentry_glstats; + struct dentry *debugfs_dentry_sbstats; }; +static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which) +{ + gl->gl_stats.stats[which]++; +} + +static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which) +{ + const struct gfs2_sbd *sdp = gl->gl_sbd; + preempt_disable(); + this_cpu_ptr(sdp->sd_lkstats)->lkstats[gl->gl_name.ln_type].stats[which]++; + preempt_enable(); +} + #endif /* __INCORE_DOT_H__ */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 37725ade3c5..e62e5947788 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1,93 +1,56 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> +#include <linux/namei.h> +#include <linux/mm.h> +#include <linux/xattr.h> #include <linux/posix_acl.h> -#include <linux/sort.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> -#include <linux/lm_interface.h> +#include <linux/fiemap.h> #include <linux/security.h> +#include <asm/uaccess.h> #include "gfs2.h" #include "incore.h" #include "acl.h" #include "bmap.h" #include "dir.h" -#include "eattr.h" +#include "xattr.h" #include "glock.h" -#include "glops.h" #include "inode.h" -#include "log.h" #include "meta_io.h" -#include "ops_address.h" -#include "ops_inode.h" #include "quota.h" #include "rgrp.h" #include "trans.h" #include "util.h" - -struct gfs2_inum_range_host { - u64 ir_start; - u64 ir_length; -}; - -static int iget_test(struct inode *inode, void *opaque) -{ - struct gfs2_inode *ip = GFS2_I(inode); - u64 *no_addr = opaque; - - if (ip->i_no_addr == *no_addr && - inode->i_private != NULL) - return 1; - - return 0; -} - -static int iget_set(struct inode *inode, void *opaque) -{ - struct gfs2_inode *ip = GFS2_I(inode); - u64 *no_addr = opaque; - - inode->i_ino = (unsigned long)*no_addr; - ip->i_no_addr = *no_addr; - return 0; -} - -struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr) -{ - unsigned long hash = (unsigned long)no_addr; - return ilookup5(sb, hash, iget_test, &no_addr); -} - -static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr) -{ - unsigned long hash = (unsigned long)no_addr; - return iget5_locked(sb, hash, iget_test, iget_set, &no_addr); -} +#include "super.h" +#include "glops.h" struct gfs2_skip_data { - u64 no_addr; - int skipped; + u64 no_addr; + int skipped; + int non_block; }; -static int iget_skip_test(struct inode *inode, void *opaque) +static int iget_test(struct inode *inode, void *opaque) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_skip_data *data = opaque; - if (ip->i_no_addr == data->no_addr && inode->i_private != NULL){ - if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){ + if (ip->i_no_addr == data->no_addr) { + if (data->non_block && + inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { data->skipped = 1; return 0; } @@ -96,63 +59,72 @@ static int iget_skip_test(struct inode *inode, void *opaque) return 0; } -static int iget_skip_set(struct inode *inode, void *opaque) +static int iget_set(struct inode *inode, void *opaque) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_skip_data *data = opaque; if (data->skipped) - return 1; + return -ENOENT; inode->i_ino = (unsigned long)(data->no_addr); ip->i_no_addr = data->no_addr; return 0; } -static struct inode *gfs2_iget_skip(struct super_block *sb, - u64 no_addr) +struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int non_block) +{ + unsigned long hash = (unsigned long)no_addr; + struct gfs2_skip_data data; + + data.no_addr = no_addr; + data.skipped = 0; + data.non_block = non_block; + return ilookup5(sb, hash, iget_test, &data); +} + +static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr, + int non_block) { struct gfs2_skip_data data; unsigned long hash = (unsigned long)no_addr; data.no_addr = no_addr; data.skipped = 0; - return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data); + data.non_block = non_block; + return iget5_locked(sb, hash, iget_test, iget_set, &data); } /** - * GFS2 lookup code fills in vfs inode contents based on info obtained - * from directory entry inside gfs2_inode_lookup(). This has caused issues - * with NFS code path since its get_dentry routine doesn't have the relevant - * directory entry when gfs2_inode_lookup() is invoked. Part of the code - * segment inside gfs2_inode_lookup code needs to get moved around. + * gfs2_set_iop - Sets inode operations + * @inode: The inode with correct i_mode filled in * - * Clean up I_LOCK and I_NEW as well. - **/ + * GFS2 lookup code fills in vfs inode contents based on info obtained + * from directory entry inside gfs2_inode_lookup(). + */ -void gfs2_set_iop(struct inode *inode) +static void gfs2_set_iop(struct inode *inode) { struct gfs2_sbd *sdp = GFS2_SB(inode); umode_t mode = inode->i_mode; if (S_ISREG(mode)) { inode->i_op = &gfs2_file_iops; - if (sdp->sd_args.ar_localflocks) + if (gfs2_localflocks(sdp)) inode->i_fop = &gfs2_file_fops_nolock; else inode->i_fop = &gfs2_file_fops; } else if (S_ISDIR(mode)) { inode->i_op = &gfs2_dir_iops; - if (sdp->sd_args.ar_localflocks) + if (gfs2_localflocks(sdp)) inode->i_fop = &gfs2_dir_fops_nolock; else inode->i_fop = &gfs2_dir_fops; } else if (S_ISLNK(mode)) { inode->i_op = &gfs2_symlink_iops; } else { - inode->i_op = &gfs2_dev_iops; + inode->i_op = &gfs2_file_iops; + init_special_inode(inode, inode->i_mode, inode->i_rdev); } - - unlock_new_inode(inode); } /** @@ -160,33 +132,27 @@ void gfs2_set_iop(struct inode *inode) * @sb: The super block * @no_addr: The inode number * @type: The type of the inode - * @skip_freeing: set this not return an inode if it is currently being freed. + * non_block: Can we block on inodes that are being freed? * * Returns: A VFS inode, or an error */ -struct inode *gfs2_inode_lookup(struct super_block *sb, - unsigned int type, - u64 no_addr, - u64 no_formal_ino, int skip_freeing) +struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, + u64 no_addr, u64 no_formal_ino, int non_block) { struct inode *inode; struct gfs2_inode *ip; - struct gfs2_glock *io_gl; + struct gfs2_glock *io_gl = NULL; int error; - if (skip_freeing) - inode = gfs2_iget_skip(sb, no_addr); - else - inode = gfs2_iget(sb, no_addr); + inode = gfs2_iget(sb, no_addr, non_block); ip = GFS2_I(inode); if (!inode) - return ERR_PTR(-ENOBUFS); + return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); - inode->i_private = ip; ip->i_no_formal_ino = no_formal_ino; error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); @@ -202,40 +168,33 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail_iopen; - ip->i_iopen_gh.gh_gl->gl_object = ip; + ip->i_iopen_gh.gh_gl->gl_object = ip; gfs2_glock_put(io_gl); + io_gl = NULL; - if ((type == DT_UNKNOWN) && (no_formal_ino == 0)) - goto gfs2_nfsbypass; - - inode->i_mode = DT2IF(type); - - /* - * We must read the inode in order to work out its type in - * this case. Note that this doesn't happen often as we normally - * know the type beforehand. This code path only occurs during - * unlinked inode recovery (where it is safe to do this glock, - * which is not true in the general case). - */ if (type == DT_UNKNOWN) { - struct gfs2_holder gh; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (unlikely(error)) - goto fail_glock; - /* Inode is now uptodate */ - gfs2_glock_dq_uninit(&gh); + /* Inode glock must be locked already */ + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) + goto fail_refresh; + } else { + inode->i_mode = DT2IF(type); } gfs2_set_iop(inode); + unlock_new_inode(inode); } -gfs2_nfsbypass: return inode; -fail_glock: - gfs2_glock_dq(&ip->i_iopen_gh); + +fail_refresh: + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + ip->i_iopen_gh.gh_gl->gl_object = NULL; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_iopen: - gfs2_glock_put(io_gl); + if (io_gl) + gfs2_glock_put(io_gl); fail_put: ip->i_gl->gl_object = NULL; gfs2_glock_put(ip->i_gl); @@ -244,202 +203,56 @@ fail: return ERR_PTR(error); } -static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) +struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 *no_formal_ino, unsigned int blktype) { - struct gfs2_dinode_host *di = &ip->i_di; - const struct gfs2_dinode *str = buf; - - if (ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)) { - if (gfs2_consist_inode(ip)) - gfs2_dinode_print(ip); - return -EIO; - } - ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino); - ip->i_inode.i_mode = be32_to_cpu(str->di_mode); - ip->i_inode.i_rdev = 0; - switch (ip->i_inode.i_mode & S_IFMT) { - case S_IFBLK: - case S_IFCHR: - ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major), - be32_to_cpu(str->di_minor)); - break; - }; - - ip->i_inode.i_uid = be32_to_cpu(str->di_uid); - ip->i_inode.i_gid = be32_to_cpu(str->di_gid); - /* - * We will need to review setting the nlink count here in the - * light of the forthcoming ro bind mount work. This is a reminder - * to do that. - */ - ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); - di->di_size = be64_to_cpu(str->di_size); - i_size_write(&ip->i_inode, di->di_size); - di->di_blocks = be64_to_cpu(str->di_blocks); - gfs2_set_inode_blocks(&ip->i_inode); - ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime); - ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); - ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); - ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); - ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); - ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); - - di->di_goal_meta = be64_to_cpu(str->di_goal_meta); - di->di_goal_data = be64_to_cpu(str->di_goal_data); - di->di_generation = be64_to_cpu(str->di_generation); - - di->di_flags = be32_to_cpu(str->di_flags); - gfs2_set_inode_flags(&ip->i_inode); - di->di_height = be16_to_cpu(str->di_height); - - di->di_depth = be16_to_cpu(str->di_depth); - di->di_entries = be32_to_cpu(str->di_entries); - - di->di_eattr = be64_to_cpu(str->di_eattr); - if (S_ISREG(ip->i_inode.i_mode)) - gfs2_set_aops(&ip->i_inode); - - return 0; -} - -/** - * gfs2_inode_refresh - Refresh the incore copy of the dinode - * @ip: The GFS2 inode - * - * Returns: errno - */ - -int gfs2_inode_refresh(struct gfs2_inode *ip) -{ - struct buffer_head *dibh; - int error; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - return error; - - if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) { - brelse(dibh); - return -EIO; - } - - error = gfs2_dinode_in(ip, dibh->b_data); - brelse(dibh); - clear_bit(GIF_INVALID, &ip->i_flags); - - return error; -} - -int gfs2_dinode_dealloc(struct gfs2_inode *ip) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al; - struct gfs2_rgrpd *rgd; + struct super_block *sb = sdp->sd_vfs; + struct gfs2_holder i_gh; + struct inode *inode = NULL; int error; - if (ip->i_di.di_blocks != 1) { - if (gfs2_consist_inode(ip)) - gfs2_dinode_print(ip); - return -EIO; - } - - al = gfs2_alloc_get(ip); - - error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + /* Must not read in block until block type is verified */ + error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops, + LM_ST_EXCLUSIVE, GL_SKIP, &i_gh); if (error) - goto out; + return ERR_PTR(error); - error = gfs2_rindex_hold(sdp, &al->al_ri_gh); + error = gfs2_check_blk_type(sdp, no_addr, blktype); if (error) - goto out_qs; - - rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); - if (!rgd) { - gfs2_consist_inode(ip); - error = -EIO; - goto out_rindex_relse; - } - - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, - &al->al_rgd_gh); - if (error) - goto out_rindex_relse; - - error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 1); - if (error) - goto out_rg_gunlock; + goto fail; - set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); - set_bit(GLF_LFLUSH, &ip->i_gl->gl_flags); + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, 1); + if (IS_ERR(inode)) + goto fail; - gfs2_free_di(rgd, ip); + /* Two extra checks for NFS only */ + if (no_formal_ino) { + error = -ESTALE; + if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino) + goto fail_iput; - gfs2_trans_end(sdp); - clear_bit(GLF_STICKY, &ip->i_gl->gl_flags); - -out_rg_gunlock: - gfs2_glock_dq_uninit(&al->al_rgd_gh); -out_rindex_relse: - gfs2_glock_dq_uninit(&al->al_ri_gh); -out_qs: - gfs2_quota_unhold(ip); -out: - gfs2_alloc_put(ip); - return error; -} - -/** - * gfs2_change_nlink - Change nlink count on inode - * @ip: The GFS2 inode - * @diff: The change in the nlink count required - * - * Returns: errno - */ -int gfs2_change_nlink(struct gfs2_inode *ip, int diff) -{ - struct buffer_head *dibh; - u32 nlink; - int error; - - BUG_ON(diff != 1 && diff != -1); - nlink = ip->i_inode.i_nlink + diff; + error = -EIO; + if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) + goto fail_iput; - /* If we are reducing the nlink count, but the new value ends up being - bigger than the old one, we must have underflowed. */ - if (diff < 0 && nlink > ip->i_inode.i_nlink) { - if (gfs2_consist_inode(ip)) - gfs2_dinode_print(ip); - return -EIO; + error = 0; } - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - return error; - - if (diff > 0) - inc_nlink(&ip->i_inode); - else - drop_nlink(&ip->i_inode); - - ip->i_inode.i_ctime = CURRENT_TIME; - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - mark_inode_dirty(&ip->i_inode); - - if (ip->i_inode.i_nlink == 0) - gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */ - - return error; +fail: + gfs2_glock_dq_uninit(&i_gh); + return error ? ERR_PTR(error) : inode; +fail_iput: + iput(inode); + goto fail; } + struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) { struct qstr qstr; struct inode *inode; gfs2_str2qstr(&qstr, name); - inode = gfs2_lookupi(dip, &qstr, 1, NULL); + inode = gfs2_lookupi(dip, &qstr, 1); /* gfs2_lookupi has inconsistent callers: vfs * related routines expect NULL for no entry found, * gfs2_lookup_simple callers expect ENOENT @@ -468,7 +281,7 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) */ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, - int is_root, struct nameidata *nd) + int is_root) { struct super_block *sb = dir->i_sb; struct gfs2_inode *dip = GFS2_I(dir); @@ -487,7 +300,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, return dir; } - if (gfs2_glock_is_locked_by_me(dip->i_gl) == 0) { + if (gfs2_glock_is_locked_by_me(dip->i_gl) == NULL) { error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); if (error) return ERR_PTR(error); @@ -495,12 +308,12 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, } if (!is_root) { - error = permission(dir, MAY_EXEC, NULL); + error = gfs2_permission(dir, MAY_EXEC); if (error) goto out; } - inode = gfs2_dir_search(dir, name); + inode = gfs2_dir_search(dir, name, false); if (IS_ERR(inode)) error = PTR_ERR(inode); out: @@ -511,139 +324,6 @@ out: return inode ? inode : ERR_PTR(error); } -static void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf) -{ - const struct gfs2_inum_range *str = buf; - - ir->ir_start = be64_to_cpu(str->ir_start); - ir->ir_length = be64_to_cpu(str->ir_length); -} - -static void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf) -{ - struct gfs2_inum_range *str = buf; - - str->ir_start = cpu_to_be64(ir->ir_start); - str->ir_length = cpu_to_be64(ir->ir_length); -} - -static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino) -{ - struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode); - struct buffer_head *bh; - struct gfs2_inum_range_host ir; - int error; - - error = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (error) - return error; - mutex_lock(&sdp->sd_inum_mutex); - - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) { - mutex_unlock(&sdp->sd_inum_mutex); - gfs2_trans_end(sdp); - return error; - } - - gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode)); - - if (ir.ir_length) { - *formal_ino = ir.ir_start++; - ir.ir_length--; - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_inum_range_out(&ir, - bh->b_data + sizeof(struct gfs2_dinode)); - brelse(bh); - mutex_unlock(&sdp->sd_inum_mutex); - gfs2_trans_end(sdp); - return 0; - } - - brelse(bh); - - mutex_unlock(&sdp->sd_inum_mutex); - gfs2_trans_end(sdp); - - return 1; -} - -static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino) -{ - struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode); - struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode); - struct gfs2_holder gh; - struct buffer_head *bh; - struct gfs2_inum_range_host ir; - int error; - - error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (error) - return error; - - error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0); - if (error) - goto out; - mutex_lock(&sdp->sd_inum_mutex); - - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - goto out_end_trans; - - gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode)); - - if (!ir.ir_length) { - struct buffer_head *m_bh; - u64 x, y; - __be64 z; - - error = gfs2_meta_inode_buffer(m_ip, &m_bh); - if (error) - goto out_brelse; - - z = *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)); - x = y = be64_to_cpu(z); - ir.ir_start = x; - ir.ir_length = GFS2_INUM_QUANTUM; - x += GFS2_INUM_QUANTUM; - if (x < y) - gfs2_consist_inode(m_ip); - z = cpu_to_be64(x); - gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); - *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = z; - - brelse(m_bh); - } - - *formal_ino = ir.ir_start++; - ir.ir_length--; - - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode)); - -out_brelse: - brelse(bh); -out_end_trans: - mutex_unlock(&sdp->sd_inum_mutex); - gfs2_trans_end(sdp); -out: - gfs2_glock_dq_uninit(&gh); - return error; -} - -static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum) -{ - int error; - - error = pick_formal_ino_1(sdp, inum); - if (error <= 0) - return error; - - error = pick_formal_ino_2(sdp, inum); - - return error; -} - /** * create_ok - OK to create a new on-disk inode here? * @dip: Directory in which dinode is to be created @@ -654,30 +334,19 @@ static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum) */ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, - unsigned int mode) + umode_t mode) { int error; - error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL); + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); if (error) return error; /* Don't create entries in an unlinked directory */ if (!dip->i_inode.i_nlink) - return -EPERM; + return -ENOENT; - error = gfs2_dir_check(&dip->i_inode, name, NULL); - switch (error) { - case -ENOENT: - error = 0; - break; - case 0: - return -EEXIST; - default: - return error; - } - - if (dip->i_di.di_entries == (u32)-1) + if (dip->i_entries == (u32)-1) return -EFBIG; if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) return -EMLINK; @@ -685,197 +354,186 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, return 0; } -static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode, - unsigned int *uid, unsigned int *gid) +static void munge_mode_uid_gid(const struct gfs2_inode *dip, + struct inode *inode) { if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && - (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) { - if (S_ISDIR(*mode)) - *mode |= S_ISUID; - else if (dip->i_inode.i_uid != current->fsuid) - *mode &= ~07111; - *uid = dip->i_inode.i_uid; + (dip->i_inode.i_mode & S_ISUID) && + !uid_eq(dip->i_inode.i_uid, GLOBAL_ROOT_UID)) { + if (S_ISDIR(inode->i_mode)) + inode->i_mode |= S_ISUID; + else if (!uid_eq(dip->i_inode.i_uid, current_fsuid())) + inode->i_mode &= ~07111; + inode->i_uid = dip->i_inode.i_uid; } else - *uid = current->fsuid; + inode->i_uid = current_fsuid(); if (dip->i_inode.i_mode & S_ISGID) { - if (S_ISDIR(*mode)) - *mode |= S_ISGID; - *gid = dip->i_inode.i_gid; + if (S_ISDIR(inode->i_mode)) + inode->i_mode |= S_ISGID; + inode->i_gid = dip->i_inode.i_gid; } else - *gid = current->fsgid; + inode->i_gid = current_fsgid(); } -static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) +static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks) { - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, }; int error; - if (gfs2_alloc_get(dip) == NULL) - return -ENOMEM; - - dip->i_alloc->al_requested = RES_DINODE; - error = gfs2_inplace_reserve(dip); + error = gfs2_quota_lock_check(ip); if (error) goto out; - error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0); + error = gfs2_inplace_reserve(ip, &ap); + if (error) + goto out_quota; + + error = gfs2_trans_begin(sdp, (*dblocks * RES_RG_BIT) + RES_STATFS + RES_QUOTA, 0); if (error) goto out_ipreserv; - *no_addr = gfs2_alloc_di(dip, generation); + error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation); + ip->i_no_formal_ino = ip->i_generation; + ip->i_inode.i_ino = ip->i_no_addr; + ip->i_goal = ip->i_no_addr; gfs2_trans_end(sdp); out_ipreserv: - gfs2_inplace_release(dip); + gfs2_inplace_release(ip); +out_quota: + gfs2_quota_unlock(ip); out: - gfs2_alloc_put(dip); return error; } +static void gfs2_init_dir(struct buffer_head *dibh, + const struct gfs2_inode *parent) +{ + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); + + gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent); + dent->de_inum = di->di_num; /* already GFS2 endian */ + dent->de_type = cpu_to_be16(DT_DIR); + + dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); + gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); + gfs2_inum_out(parent, dent); + dent->de_type = cpu_to_be16(DT_DIR); + +} + +/** + * gfs2_init_xattr - Initialise an xattr block for a new inode + * @ip: The inode in question + * + * This sets up an empty xattr block for a new inode, ready to + * take any ACLs, LSM xattrs, etc. + */ + +static void gfs2_init_xattr(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh; + struct gfs2_ea_header *ea; + + bh = gfs2_meta_new(ip->i_gl, ip->i_eattr); + gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_metatype_set(bh, GFS2_METATYPE_EA, GFS2_FORMAT_EA); + gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); + + ea = GFS2_EA_BH2FIRST(bh); + ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize); + ea->ea_type = GFS2_EATYPE_UNUSED; + ea->ea_flags = GFS2_EAFLAG_LAST; + + brelse(bh); +} + /** * init_dinode - Fill in a new dinode structure - * @dip: the directory this inode is being created in - * @gl: The glock covering the new inode - * @inum: the inode number - * @mode: the file permissions - * @uid: - * @gid: + * @dip: The directory this inode is being created in + * @ip: The inode + * @symname: The symlink destination (if a symlink) + * @bhp: The buffer head (returned to caller) * */ -static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, - const struct gfs2_inum_host *inum, unsigned int mode, - unsigned int uid, unsigned int gid, - const u64 *generation, dev_t dev, struct buffer_head **bhp) +static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, + const char *symname) { - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_dinode *di; struct buffer_head *dibh; - struct timespec tv = CURRENT_TIME; - dibh = gfs2_meta_new(gl, inum->no_addr); - gfs2_trans_add_bh(gl, dibh, 1); - gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); - gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr); + gfs2_trans_add_meta(ip->i_gl, dibh); di = (struct gfs2_dinode *)dibh->b_data; + gfs2_dinode_out(ip, di); - di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino); - di->di_num.no_addr = cpu_to_be64(inum->no_addr); - di->di_mode = cpu_to_be32(mode); - di->di_uid = cpu_to_be32(uid); - di->di_gid = cpu_to_be32(gid); - di->di_nlink = 0; - di->di_size = 0; - di->di_blocks = cpu_to_be64(1); - di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec); - di->di_major = cpu_to_be32(MAJOR(dev)); - di->di_minor = cpu_to_be32(MINOR(dev)); - di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr); - di->di_generation = cpu_to_be64(*generation); - di->di_flags = 0; - - if (S_ISREG(mode)) { - if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) || - gfs2_tune_get(sdp, gt_new_files_jdata)) - di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); - if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) || - gfs2_tune_get(sdp, gt_new_files_directio)) - di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO); - } else if (S_ISDIR(mode)) { - di->di_flags |= cpu_to_be32(dip->i_di.di_flags & - GFS2_DIF_INHERIT_DIRECTIO); - di->di_flags |= cpu_to_be32(dip->i_di.di_flags & - GFS2_DIF_INHERIT_JDATA); - } - + di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev)); + di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev)); di->__pad1 = 0; - di->di_payload_format = cpu_to_be32(S_ISDIR(mode) ? GFS2_FORMAT_DE : 0); - di->di_height = 0; di->__pad2 = 0; di->__pad3 = 0; - di->di_depth = 0; - di->di_entries = 0; memset(&di->__pad4, 0, sizeof(di->__pad4)); - di->di_eattr = 0; - di->di_atime_nsec = cpu_to_be32(tv.tv_nsec); - di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec); - di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec); memset(&di->di_reserved, 0, sizeof(di->di_reserved)); - - set_buffer_uptodate(dibh); - - *bhp = dibh; -} - -static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, - unsigned int mode, const struct gfs2_inum_host *inum, - const u64 *generation, dev_t dev, struct buffer_head **bhp) -{ - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - unsigned int uid, gid; - int error; - - munge_mode_uid_gid(dip, &mode, &uid, &gid); - gfs2_alloc_get(dip); - - error = gfs2_quota_lock(dip, uid, gid); - if (error) - goto out; + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); - error = gfs2_quota_check(dip, uid, gid); - if (error) - goto out_quota; + switch(ip->i_inode.i_mode & S_IFMT) { + case S_IFDIR: + gfs2_init_dir(dibh, dip); + break; + case S_IFLNK: + memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, ip->i_inode.i_size); + break; + } - error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0); - if (error) - goto out_quota; + set_buffer_uptodate(dibh); + brelse(dibh); +} - init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, bhp); - gfs2_quota_change(dip, +1, uid, gid); - gfs2_trans_end(sdp); +/** + * gfs2_trans_da_blocks - Calculate number of blocks to link inode + * @dip: The directory we are linking into + * @da: The dir add information + * @nr_inodes: The number of inodes involved + * + * This calculate the number of blocks we need to reserve in a + * transaction to link @nr_inodes into a directory. In most cases + * @nr_inodes will be 2 (the directory plus the inode being linked in) + * but in case of rename, 4 may be required. + * + * Returns: Number of blocks + */ -out_quota: - gfs2_quota_unlock(dip); -out: - gfs2_alloc_put(dip); - return error; +static unsigned gfs2_trans_da_blks(const struct gfs2_inode *dip, + const struct gfs2_diradd *da, + unsigned nr_inodes) +{ + return da->nr_blocks + gfs2_rg_blocks(dip, da->nr_blocks) + + (nr_inodes * RES_DINODE) + RES_QUOTA + RES_STATFS; } static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip) + struct gfs2_inode *ip, struct gfs2_diradd *da) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct gfs2_alloc *al; - int alloc_required; - struct buffer_head *dibh; + struct gfs2_alloc_parms ap = { .target = da->nr_blocks, }; int error; - al = gfs2_alloc_get(dip); - - error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); - if (error) - goto fail; - - error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name); - if (alloc_required < 0) - goto fail_quota_locks; - if (alloc_required) { - error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid); + if (da->nr_blocks) { + error = gfs2_quota_lock_check(dip); if (error) goto fail_quota_locks; - al->al_requested = sdp->sd_max_dirres; - - error = gfs2_inplace_reserve(dip); + error = gfs2_inplace_reserve(dip, &ap); if (error) goto fail_quota_locks; - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - al->al_rgd->rd_length + - 2 * RES_DINODE + - RES_STATFS + RES_QUOTA, 0); + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, da, 2), 0); if (error) goto fail_ipreserv; } else { @@ -884,100 +542,78 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, goto fail_quota_locks; } - error = gfs2_dir_add(&dip->i_inode, name, ip, IF2DT(ip->i_inode.i_mode)); - if (error) - goto fail_end_trans; - - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_dir_add(&dip->i_inode, name, ip, da); if (error) goto fail_end_trans; - ip->i_inode.i_nlink = 1; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - return 0; fail_end_trans: gfs2_trans_end(sdp); - fail_ipreserv: - if (dip->i_alloc->al_rgd) - gfs2_inplace_release(dip); - + gfs2_inplace_release(dip); fail_quota_locks: gfs2_quota_unlock(dip); - -fail: - gfs2_alloc_put(dip); return error; } -static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) +static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; - struct gfs2_ea_request er; - - err = security_inode_init_security(&ip->i_inode, &dip->i_inode, - &name, &value, &len); - - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = __gfs2_xattr_set(inode, xattr->name, xattr->value, + xattr->value_len, 0, + GFS2_EATYPE_SECURITY); + if (err < 0) + break; } - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - - er.er_type = GFS2_EATYPE_SECURITY; - er.er_name = name; - er.er_data = value; - er.er_name_len = strlen(name); - er.er_data_len = len; - - err = gfs2_ea_set_i(ip, &er); - - kfree(value); - kfree(name); - return err; } /** - * gfs2_createi - Create a new inode - * @ghs: An array of two holders - * @name: The name of the new file - * @mode: the permissions on the new inode - * - * @ghs[0] is an initialized holder for the directory - * @ghs[1] is the holder for the inode lock + * gfs2_create_inode - Create a new inode + * @dir: The parent directory + * @dentry: The new dentry + * @file: If non-NULL, the file which is being opened + * @mode: The permissions on the new inode + * @dev: For device nodes, this is the device number + * @symname: For symlinks, this is the link destination + * @size: The initial size of the inode (ignored for directories) * - * If the return value is not NULL, the glocks on both the directory and the new - * file are held. A transaction has been started and an inplace reservation - * is held, as well. - * - * Returns: An inode + * Returns: 0 on success, or error code */ -struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, - unsigned int mode, dev_t dev) +static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, + struct file *file, + umode_t mode, dev_t dev, const char *symname, + unsigned int size, int excl, int *opened) { + const struct qstr *name = &dentry->d_name; + struct posix_acl *default_acl, *acl; + struct gfs2_holder ghs[2]; struct inode *inode = NULL; - struct gfs2_inode *dip = ghs->gh_gl->gl_object; - struct inode *dir = &dip->i_inode; + struct gfs2_inode *dip = GFS2_I(dir), *ip; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 }; - int error; - u64 generation; - struct buffer_head *bh = NULL; + struct gfs2_glock *io_gl; + struct dentry *d; + int error, free_vfs_inode = 0; + u32 aflags = 0; + unsigned blocks = 1; + struct gfs2_diradd da = { .bh = NULL, }; if (!name->len || name->len > GFS2_FNAMESIZE) - return ERR_PTR(-ENAMETOOLONG); + return -ENAMETOOLONG; + + error = gfs2_rs_alloc(dip); + if (error) + return error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; - gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs); - error = gfs2_glock_nq(ghs); + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); if (error) goto fail; @@ -985,113 +621,392 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (error) goto fail_gunlock; - error = pick_formal_ino(sdp, &inum.no_formal_ino); - if (error) + inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl); + error = PTR_ERR(inode); + if (!IS_ERR(inode)) { + d = d_splice_alias(inode, dentry); + error = PTR_ERR(d); + if (IS_ERR(d)) + goto fail_gunlock; + error = 0; + if (file) { + if (S_ISREG(inode->i_mode)) { + WARN_ON(d != NULL); + error = finish_open(file, dentry, gfs2_open_common, opened); + } else { + error = finish_no_open(file, d); + } + } else { + dput(d); + } + gfs2_glock_dq_uninit(ghs); + return error; + } else if (error != -ENOENT) { goto fail_gunlock; + } - error = alloc_dinode(dip, &inum.no_addr, &generation); - if (error) + error = gfs2_diradd_alloc_required(dir, name, &da); + if (error < 0) goto fail_gunlock; - error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops, - LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); - if (error) + inode = new_inode(sdp->sd_vfs); + error = -ENOMEM; + if (!inode) goto fail_gunlock; - error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, &bh); + error = posix_acl_create(dir, &mode, &default_acl, &acl); if (error) - goto fail_gunlock2; + goto fail_free_vfs_inode; - inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), - inum.no_addr, - inum.no_formal_ino, 0); - if (IS_ERR(inode)) - goto fail_gunlock2; + ip = GFS2_I(inode); + error = gfs2_rs_alloc(ip); + if (error) + goto fail_free_acls; + + inode->i_mode = mode; + set_nlink(inode, S_ISDIR(mode) ? 2 : 1); + inode->i_rdev = dev; + inode->i_size = size; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + gfs2_set_inode_blocks(inode, 1); + munge_mode_uid_gid(dip, inode); + ip->i_goal = dip->i_goal; + ip->i_diskflags = 0; + ip->i_eattr = 0; + ip->i_height = 0; + ip->i_depth = 0; + ip->i_entries = 0; + + switch(mode & S_IFMT) { + case S_IFREG: + if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || + gfs2_tune_get(sdp, gt_new_files_jdata)) + ip->i_diskflags |= GFS2_DIF_JDATA; + gfs2_set_aops(inode); + break; + case S_IFDIR: + ip->i_diskflags |= (dip->i_diskflags & GFS2_DIF_INHERIT_JDATA); + ip->i_diskflags |= GFS2_DIF_JDATA; + ip->i_entries = 2; + break; + } + gfs2_set_inode_flags(inode); - error = gfs2_inode_refresh(GFS2_I(inode)); + if ((GFS2_I(sdp->sd_root_dir->d_inode) == dip) || + (dip->i_diskflags & GFS2_DIF_TOPDIR)) + aflags |= GFS2_AF_ORLOV; + + if (default_acl || acl) + blocks++; + + error = alloc_dinode(ip, aflags, &blocks); if (error) - goto fail_gunlock2; + goto fail_free_inode; - error = gfs2_acl_create(dip, GFS2_I(inode)); + gfs2_set_inode_blocks(inode, blocks); + + error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); + if (error) + goto fail_free_inode; + + ip->i_gl->gl_object = ip; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + if (error) + goto fail_free_inode; + + error = gfs2_trans_begin(sdp, blocks, 0); if (error) goto fail_gunlock2; - error = gfs2_security_init(dip, GFS2_I(inode)); + if (blocks > 1) { + ip->i_eattr = ip->i_no_addr + 1; + gfs2_init_xattr(ip); + } + init_dinode(dip, ip, symname); + gfs2_trans_end(sdp); + + error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (error) goto fail_gunlock2; - error = link_dinode(dip, name, GFS2_I(inode)); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (error) goto fail_gunlock2; - if (bh) - brelse(bh); - if (!inode) - return ERR_PTR(-ENOMEM); - return inode; + ip->i_iopen_gh.gh_gl->gl_object = ip; + gfs2_glock_put(io_gl); + gfs2_set_iop(inode); + insert_inode_hash(inode); + + if (default_acl) { + error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } + + if (error) + goto fail_gunlock3; + + error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name, + &gfs2_initxattrs, NULL); + if (error) + goto fail_gunlock3; + + error = link_dinode(dip, name, ip, &da); + if (error) + goto fail_gunlock3; + + mark_inode_dirty(inode); + d_instantiate(dentry, inode); + if (file) { + *opened |= FILE_CREATED; + error = finish_open(file, dentry, gfs2_open_common, opened); + } + gfs2_glock_dq_uninit(ghs); + gfs2_glock_dq_uninit(ghs + 1); + return error; + +fail_gunlock3: + gfs2_glock_dq_uninit(ghs + 1); + if (ip->i_gl) + gfs2_glock_put(ip->i_gl); + goto fail_gunlock; fail_gunlock2: gfs2_glock_dq_uninit(ghs + 1); - if (inode) - iput(inode); +fail_free_inode: + if (ip->i_gl) + gfs2_glock_put(ip->i_gl); + gfs2_rs_delete(ip, NULL); +fail_free_acls: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); +fail_free_vfs_inode: + free_vfs_inode = 1; fail_gunlock: - gfs2_glock_dq(ghs); + gfs2_dir_no_add(&da); + gfs2_glock_dq_uninit(ghs); + if (inode && !IS_ERR(inode)) { + clear_nlink(inode); + if (!free_vfs_inode) + mark_inode_dirty(inode); + set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED, + &GFS2_I(inode)->i_flags); + iput(inode); + } fail: - if (bh) - brelse(bh); - return ERR_PTR(error); + return error; +} + +/** + * gfs2_create - Create a file + * @dir: The directory in which to create the file + * @dentry: The dentry of the new file + * @mode: The mode of the new file + * + * Returns: errno + */ + +static int gfs2_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) +{ + return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl, NULL); } /** - * gfs2_rmdiri - Remove a directory - * @dip: The parent directory of the directory to be removed - * @name: The name of the directory to be removed - * @ip: The GFS2 inode of the directory to be removed + * __gfs2_lookup - Look up a filename in a directory and return its inode + * @dir: The directory inode + * @dentry: The dentry of the new inode + * @file: File to be opened + * @opened: atomic_open flags * - * Assumes Glocks on dip and ip are held * * Returns: errno */ -int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip) +static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, + struct file *file, int *opened) { - struct qstr dotname; + struct inode *inode; + struct dentry *d; + struct gfs2_holder gh; + struct gfs2_glock *gl; int error; - if (ip->i_di.di_entries != 2) { - if (gfs2_consist_inode(ip)) - gfs2_dinode_print(ip); - return -EIO; + inode = gfs2_lookupi(dir, &dentry->d_name, 0); + if (!inode) + return NULL; + if (IS_ERR(inode)) + return ERR_CAST(inode); + + gl = GFS2_I(inode)->i_gl; + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) { + iput(inode); + return ERR_PTR(error); + } + + d = d_splice_alias(inode, dentry); + if (IS_ERR(d)) { + iput(inode); + gfs2_glock_dq_uninit(&gh); + return d; + } + if (file && S_ISREG(inode->i_mode)) + error = finish_open(file, dentry, gfs2_open_common, opened); + + gfs2_glock_dq_uninit(&gh); + if (error) { + dput(d); + return ERR_PTR(error); } + return d; +} - error = gfs2_dir_del(dip, name); +static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, + unsigned flags) +{ + return __gfs2_lookup(dir, dentry, NULL, NULL); +} + +/** + * gfs2_link - Link to a file + * @old_dentry: The inode to link + * @dir: Add link to this directory + * @dentry: The name of the link + * + * Link the inode in "old_dentry" into the directory "dir" with the + * name in "dentry". + * + * Returns: errno + */ + +static int gfs2_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct inode *inode = old_dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder ghs[2]; + struct buffer_head *dibh; + struct gfs2_diradd da = { .bh = NULL, }; + int error; + + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + error = gfs2_rs_alloc(dip); if (error) return error; - error = gfs2_change_nlink(dip, -1); + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + + error = gfs2_glock_nq(ghs); /* parent */ if (error) - return error; + goto out_parent; - gfs2_str2qstr(&dotname, "."); - error = gfs2_dir_del(ip, &dotname); + error = gfs2_glock_nq(ghs + 1); /* child */ if (error) - return error; + goto out_child; - gfs2_str2qstr(&dotname, ".."); - error = gfs2_dir_del(ip, &dotname); + error = -ENOENT; + if (inode->i_nlink == 0) + goto out_gunlock; + + error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); if (error) - return error; + goto out_gunlock; + + error = gfs2_dir_check(dir, &dentry->d_name, NULL); + switch (error) { + case -ENOENT: + break; + case 0: + error = -EEXIST; + default: + goto out_gunlock; + } - /* It looks odd, but it really should be done twice */ - error = gfs2_change_nlink(ip, -1); + error = -EINVAL; + if (!dip->i_inode.i_nlink) + goto out_gunlock; + error = -EFBIG; + if (dip->i_entries == (u32)-1) + goto out_gunlock; + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out_gunlock; + error = -EINVAL; + if (!ip->i_inode.i_nlink) + goto out_gunlock; + error = -EMLINK; + if (ip->i_inode.i_nlink == (u32)-1) + goto out_gunlock; + + error = gfs2_diradd_alloc_required(dir, &dentry->d_name, &da); + if (error < 0) + goto out_gunlock; + + if (da.nr_blocks) { + struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; + error = gfs2_quota_lock_check(dip); + if (error) + goto out_gunlock; + + error = gfs2_inplace_reserve(dip, &ap); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, &da, 2), 0); + if (error) + goto out_ipres; + } else { + error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0); + if (error) + goto out_ipres; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); if (error) - return error; + goto out_end_trans; - error = gfs2_change_nlink(ip, -1); + error = gfs2_dir_add(dir, &dentry->d_name, ip, &da); if (error) - return error; + goto out_brelse; + + gfs2_trans_add_meta(ip->i_gl, dibh); + inc_nlink(&ip->i_inode); + ip->i_inode.i_ctime = CURRENT_TIME; + ihold(inode); + d_instantiate(dentry, inode); + mark_inode_dirty(inode); +out_brelse: + brelse(dibh); +out_end_trans: + gfs2_trans_end(sdp); +out_ipres: + if (da.nr_blocks) + gfs2_inplace_release(dip); +out_gunlock_q: + if (da.nr_blocks) + gfs2_quota_unlock(dip); +out_gunlock: + gfs2_dir_no_add(&da); + gfs2_glock_dq(ghs + 1); +out_child: + gfs2_glock_dq(ghs); +out_parent: + gfs2_holder_uninit(ghs); + gfs2_holder_uninit(ghs + 1); return error; } @@ -1106,8 +1021,8 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, * Returns: 0 if the parent/child relationship is correct, errno if it isn't */ -int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, - const struct gfs2_inode *ip) +static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, + const struct gfs2_inode *ip) { int error; @@ -1115,14 +1030,14 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, return -EPERM; if ((dip->i_inode.i_mode & S_ISVTX) && - dip->i_inode.i_uid != current->fsuid && - ip->i_inode.i_uid != current->fsuid && !capable(CAP_FOWNER)) + !uid_eq(dip->i_inode.i_uid, current_fsuid()) && + !uid_eq(ip->i_inode.i_uid, current_fsuid()) && !capable(CAP_FOWNER)) return -EPERM; if (IS_APPEND(&dip->i_inode)) return -EPERM; - error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL); + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -1133,6 +1048,221 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, return 0; } +/** + * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it + * @dip: The parent directory + * @name: The name of the entry in the parent directory + * @inode: The inode to be removed + * + * Called with all the locks and in a transaction. This will only be + * called for a directory after it has been checked to ensure it is empty. + * + * Returns: 0 on success, or an error + */ + +static int gfs2_unlink_inode(struct gfs2_inode *dip, + const struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + error = gfs2_dir_del(dip, dentry); + if (error) + return error; + + ip->i_entries = 0; + inode->i_ctime = CURRENT_TIME; + if (S_ISDIR(inode->i_mode)) + clear_nlink(inode); + else + drop_nlink(inode); + mark_inode_dirty(inode); + if (inode->i_nlink == 0) + gfs2_unlink_di(inode); + return 0; +} + + +/** + * gfs2_unlink - Unlink an inode (this does rmdir as well) + * @dir: The inode of the directory containing the inode to unlink + * @dentry: The file itself + * + * This routine uses the type of the inode as a flag to figure out + * whether this is an unlink or an rmdir. + * + * Returns: errno + */ + +static int gfs2_unlink(struct inode *dir, struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder ghs[3]; + struct gfs2_rgrpd *rgd; + int error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + error = -EROFS; + + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1); + if (!rgd) + goto out_inodes; + + gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); + + + error = gfs2_glock_nq(ghs); /* parent */ + if (error) + goto out_parent; + + error = gfs2_glock_nq(ghs + 1); /* child */ + if (error) + goto out_child; + + error = -ENOENT; + if (inode->i_nlink == 0) + goto out_rgrp; + + if (S_ISDIR(inode->i_mode)) { + error = -ENOTEMPTY; + if (ip->i_entries > 2 || inode->i_nlink > 2) + goto out_rgrp; + } + + error = gfs2_glock_nq(ghs + 2); /* rgrp */ + if (error) + goto out_rgrp; + + error = gfs2_unlink_ok(dip, &dentry->d_name, ip); + if (error) + goto out_gunlock; + + error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0); + if (error) + goto out_end_trans; + + error = gfs2_unlink_inode(dip, dentry); + +out_end_trans: + gfs2_trans_end(sdp); +out_gunlock: + gfs2_glock_dq(ghs + 2); +out_rgrp: + gfs2_glock_dq(ghs + 1); +out_child: + gfs2_glock_dq(ghs); +out_parent: + gfs2_holder_uninit(ghs + 2); +out_inodes: + gfs2_holder_uninit(ghs + 1); + gfs2_holder_uninit(ghs); + return error; +} + +/** + * gfs2_symlink - Create a symlink + * @dir: The directory to create the symlink in + * @dentry: The dentry to put the symlink in + * @symname: The thing which the link points to + * + * Returns: errno + */ + +static int gfs2_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct gfs2_sbd *sdp = GFS2_SB(dir); + unsigned int size; + + size = strlen(symname); + if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) + return -ENAMETOOLONG; + + return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0, NULL); +} + +/** + * gfs2_mkdir - Make a directory + * @dir: The parent directory of the new one + * @dentry: The dentry of the new directory + * @mode: The mode of the new directory + * + * Returns: errno + */ + +static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct gfs2_sbd *sdp = GFS2_SB(dir); + unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); + return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0, NULL); +} + +/** + * gfs2_mknod - Make a special file + * @dir: The directory in which the special file will reside + * @dentry: The dentry of the special file + * @mode: The mode of the special file + * @dev: The device specification of the special file + * + */ + +static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t dev) +{ + return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0, NULL); +} + +/** + * gfs2_atomic_open - Atomically open a file + * @dir: The directory + * @dentry: The proposed new entry + * @file: The proposed new struct file + * @flags: open flags + * @mode: File mode + * @opened: Flag to say whether the file has been opened or not + * + * Returns: error code or 0 for success + */ + +static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, + umode_t mode, int *opened) +{ + struct dentry *d; + bool excl = !!(flags & O_EXCL); + + d = __gfs2_lookup(dir, dentry, file, opened); + if (IS_ERR(d)) + return PTR_ERR(d); + if (d != NULL) + dentry = d; + if (dentry->d_inode) { + if (!(*opened & FILE_OPENED)) { + if (d == NULL) + dget(dentry); + return finish_no_open(file, dentry); + } + dput(d); + return 0; + } + + BUG_ON(d != NULL); + if (!(flags & O_CREAT)) + return -ENOENT; + + return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl, opened); +} + /* * gfs2_ok_to_move - check if it's ok to move a directory to another directory * @this: move this @@ -1144,16 +1274,13 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, * Returns: errno */ -int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) +static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) { struct inode *dir = &to->i_inode; struct super_block *sb = dir->i_sb; struct inode *tmp; - struct qstr dotdot; int error = 0; - gfs2_str2qstr(&dotdot, ".."); - igrab(dir); for (;;) { @@ -1166,7 +1293,11 @@ int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) break; } - tmp = gfs2_lookupi(dir, &dotdot, 1, NULL); + tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1); + if (!tmp) { + error = -ENOENT; + break; + } if (IS_ERR(tmp)) { error = PTR_ERR(tmp); break; @@ -1182,258 +1313,685 @@ int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) } /** - * gfs2_readlinki - return the contents of a symlink - * @ip: the symlink's inode - * @buf: a pointer to the buffer to be filled - * @len: a pointer to the length of @buf - * - * If @buf is too small, a piece of memory is kmalloc()ed and needs - * to be freed by the caller. + * gfs2_rename - Rename a file + * @odir: Parent directory of old file name + * @odentry: The old dentry of the file + * @ndir: Parent directory of new file name + * @ndentry: The new dentry of the file * * Returns: errno */ -int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) +static int gfs2_rename(struct inode *odir, struct dentry *odentry, + struct inode *ndir, struct dentry *ndentry) +{ + struct gfs2_inode *odip = GFS2_I(odir); + struct gfs2_inode *ndip = GFS2_I(ndir); + struct gfs2_inode *ip = GFS2_I(odentry->d_inode); + struct gfs2_inode *nip = NULL; + struct gfs2_sbd *sdp = GFS2_SB(odir); + struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; + struct gfs2_rgrpd *nrgd; + unsigned int num_gh; + int dir_rename = 0; + struct gfs2_diradd da = { .nr_blocks = 0, }; + unsigned int x; + int error; + + if (ndentry->d_inode) { + nip = GFS2_I(ndentry->d_inode); + if (ip == nip) + return 0; + } + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + error = gfs2_rs_alloc(ndip); + if (error) + return error; + + if (odip != ndip) { + error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, + 0, &r_gh); + if (error) + goto out; + + if (S_ISDIR(ip->i_inode.i_mode)) { + dir_rename = 1; + /* don't move a dirctory into it's subdir */ + error = gfs2_ok_to_move(ip, ndip); + if (error) + goto out_gunlock_r; + } + } + + num_gh = 1; + gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + if (odip != ndip) { + gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + } + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + + if (nip) { + gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + /* grab the resource lock for unlink flag twiddling + * this is the case of the target file already existing + * so we unlink before doing the rename + */ + nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr, 1); + if (nrgd) + gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); + } + + for (x = 0; x < num_gh; x++) { + error = gfs2_glock_nq(ghs + x); + if (error) + goto out_gunlock; + } + + error = -ENOENT; + if (ip->i_inode.i_nlink == 0) + goto out_gunlock; + + /* Check out the old directory */ + + error = gfs2_unlink_ok(odip, &odentry->d_name, ip); + if (error) + goto out_gunlock; + + /* Check out the new directory */ + + if (nip) { + error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip); + if (error) + goto out_gunlock; + + if (nip->i_inode.i_nlink == 0) { + error = -EAGAIN; + goto out_gunlock; + } + + if (S_ISDIR(nip->i_inode.i_mode)) { + if (nip->i_entries < 2) { + gfs2_consist_inode(nip); + error = -EIO; + goto out_gunlock; + } + if (nip->i_entries > 2) { + error = -ENOTEMPTY; + goto out_gunlock; + } + } + } else { + error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC); + if (error) + goto out_gunlock; + + error = gfs2_dir_check(ndir, &ndentry->d_name, NULL); + switch (error) { + case -ENOENT: + error = 0; + break; + case 0: + error = -EEXIST; + default: + goto out_gunlock; + }; + + if (odip != ndip) { + if (!ndip->i_inode.i_nlink) { + error = -ENOENT; + goto out_gunlock; + } + if (ndip->i_entries == (u32)-1) { + error = -EFBIG; + goto out_gunlock; + } + if (S_ISDIR(ip->i_inode.i_mode) && + ndip->i_inode.i_nlink == (u32)-1) { + error = -EMLINK; + goto out_gunlock; + } + } + } + + /* Check out the dir to be renamed */ + + if (dir_rename) { + error = gfs2_permission(odentry->d_inode, MAY_WRITE); + if (error) + goto out_gunlock; + } + + if (nip == NULL) { + error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name, &da); + if (error) + goto out_gunlock; + } + + if (da.nr_blocks) { + struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; + error = gfs2_quota_lock_check(ndip); + if (error) + goto out_gunlock; + + error = gfs2_inplace_reserve(ndip, &ap); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(ndip, &da, 4) + + 4 * RES_LEAF + 4, 0); + if (error) + goto out_ipreserv; + } else { + error = gfs2_trans_begin(sdp, 4 * RES_DINODE + + 5 * RES_LEAF + 4, 0); + if (error) + goto out_gunlock; + } + + /* Remove the target file, if it exists */ + + if (nip) + error = gfs2_unlink_inode(ndip, ndentry); + + if (dir_rename) { + error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); + if (error) + goto out_end_trans; + } else { + struct buffer_head *dibh; + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out_end_trans; + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + + error = gfs2_dir_del(odip, odentry); + if (error) + goto out_end_trans; + + error = gfs2_dir_add(ndir, &ndentry->d_name, ip, &da); + if (error) + goto out_end_trans; + +out_end_trans: + gfs2_trans_end(sdp); +out_ipreserv: + if (da.nr_blocks) + gfs2_inplace_release(ndip); +out_gunlock_q: + if (da.nr_blocks) + gfs2_quota_unlock(ndip); +out_gunlock: + gfs2_dir_no_add(&da); + while (x--) { + gfs2_glock_dq(ghs + x); + gfs2_holder_uninit(ghs + x); + } +out_gunlock_r: + if (r_gh.gh_gl) + gfs2_glock_dq_uninit(&r_gh); +out: + return error; +} + +/** + * gfs2_follow_link - Follow a symbolic link + * @dentry: The dentry of the link + * @nd: Data that we pass to vfs_follow_link() + * + * This can handle symlinks of any size. + * + * Returns: 0 on success or error code + */ + +static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) { + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); struct gfs2_holder i_gh; struct buffer_head *dibh; - unsigned int x; + unsigned int size; + char *buf; int error; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); - error = gfs2_glock_nq_atime(&i_gh); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); if (error) { gfs2_holder_uninit(&i_gh); - return error; + nd_set_link(nd, ERR_PTR(error)); + return NULL; } - if (!ip->i_di.di_size) { + size = (unsigned int)i_size_read(&ip->i_inode); + if (size == 0) { gfs2_consist_inode(ip); - error = -EIO; + buf = ERR_PTR(-EIO); goto out; } error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) + if (error) { + buf = ERR_PTR(error); goto out; - - x = ip->i_di.di_size + 1; - if (x > *len) { - *buf = kmalloc(x, GFP_KERNEL); - if (!*buf) { - error = -ENOMEM; - goto out_brelse; - } } - memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x); - *len = x; - -out_brelse: + buf = kzalloc(size + 1, GFP_NOFS); + if (!buf) + buf = ERR_PTR(-ENOMEM); + else + memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size); brelse(dibh); out: gfs2_glock_dq_uninit(&i_gh); - return error; + nd_set_link(nd, buf); + return NULL; } /** - * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and - * conditionally update the inode's atime - * @gh: the holder to acquire + * gfs2_permission - + * @inode: The inode + * @mask: The mask to be tested + * @flags: Indicates whether this is an RCU path walk or not * - * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap - * Update if the difference between the current time and the inode's current - * atime is greater than an interval specified at mount. + * This may be called from the VFS directly, or from within GFS2 with the + * inode locked, so we look to see if the glock is already locked and only + * lock the glock if its not already been done. * * Returns: errno */ -int gfs2_glock_nq_atime(struct gfs2_holder *gh) +int gfs2_permission(struct inode *inode, int mask) { - struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_sbd *sdp = gl->gl_sbd; - struct gfs2_inode *ip = gl->gl_object; - s64 quantum = gfs2_tune_get(sdp, gt_atime_quantum); - unsigned int state; - int flags; + struct gfs2_inode *ip; + struct gfs2_holder i_gh; + struct gfs2_sbd *sdp = GFS2_SB(inode); int error; - struct timespec tv = CURRENT_TIME; + int unlock = 0; + int frozen_root = 0; - if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) || - gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) || - gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops)) - return -EINVAL; - state = gh->gh_state; - flags = gh->gh_flags; + ip = GFS2_I(inode); + if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { + if (unlikely(gfs2_glock_is_held_excl(sdp->sd_freeze_gl) && + inode == sdp->sd_root_dir->d_inode && + atomic_inc_not_zero(&sdp->sd_frozen_root))) + frozen_root = 1; + else { + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return error; + unlock = 1; + } + } + + if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) + error = -EACCES; + else + error = generic_permission(inode, mask); + if (unlock) + gfs2_glock_dq_uninit(&i_gh); + else if (frozen_root && atomic_dec_and_test(&sdp->sd_frozen_root)) + wake_up(&sdp->sd_frozen_root_wait); + + return error; +} + +static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr) +{ + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} - error = gfs2_glock_nq(gh); +/** + * gfs2_setattr_simple - + * @ip: + * @attr: + * + * Returns: errno + */ + +int gfs2_setattr_simple(struct inode *inode, struct iattr *attr) +{ + int error; + + if (current->journal_info) + return __gfs2_setattr_simple(inode, attr); + + error = gfs2_trans_begin(GFS2_SB(inode), RES_DINODE, 0); if (error) return error; - if (test_bit(SDF_NOATIME, &sdp->sd_flags) || - (sdp->sd_vfs->s_flags & MS_RDONLY)) - return 0; - - if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) { - gfs2_glock_dq(gh); - gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY, - gh); - error = gfs2_glock_nq(gh); - if (error) - return error; + error = __gfs2_setattr_simple(inode, attr); + gfs2_trans_end(GFS2_SB(inode)); + return error; +} - /* Verify that atime hasn't been updated while we were - trying to get exclusive lock. */ +static int setattr_chown(struct inode *inode, struct iattr *attr) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + kuid_t ouid, nuid; + kgid_t ogid, ngid; + int error; - tv = CURRENT_TIME; - if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) { - struct buffer_head *dibh; - struct gfs2_dinode *di; + ouid = inode->i_uid; + ogid = inode->i_gid; + nuid = attr->ia_uid; + ngid = attr->ia_gid; - error = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (error == -EROFS) - return 0; - if (error) - goto fail; + if (!(attr->ia_valid & ATTR_UID) || uid_eq(ouid, nuid)) + ouid = nuid = NO_UID_QUOTA_CHANGE; + if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) + ogid = ngid = NO_GID_QUOTA_CHANGE; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto fail_end_trans; + error = get_write_access(inode); + if (error) + return error; - ip->i_inode.i_atime = tv; + error = gfs2_rs_alloc(ip); + if (error) + goto out; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - di = (struct gfs2_dinode *)dibh->b_data; - di->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); - di->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); - brelse(dibh); + error = gfs2_rindex_update(sdp); + if (error) + goto out; - gfs2_trans_end(sdp); - } + error = gfs2_quota_lock(ip, nuid, ngid); + if (error) + goto out; - /* If someone else has asked for the glock, - unlock and let them have it. Then reacquire - in the original state. */ - if (gfs2_glock_is_blocking(gl)) { - gfs2_glock_dq(gh); - gfs2_holder_reinit(state, flags, gh); - return gfs2_glock_nq(gh); - } + if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || + !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { + error = gfs2_quota_check(ip, nuid, ngid); + if (error) + goto out_gunlock_q; } - return 0; + error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0); + if (error) + goto out_gunlock_q; -fail_end_trans: + error = gfs2_setattr_simple(inode, attr); + if (error) + goto out_end_trans; + + if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || + !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { + u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); + gfs2_quota_change(ip, -blocks, ouid, ogid); + gfs2_quota_change(ip, blocks, nuid, ngid); + } + +out_end_trans: gfs2_trans_end(sdp); -fail: - gfs2_glock_dq(gh); +out_gunlock_q: + gfs2_quota_unlock(ip); +out: + put_write_access(inode); return error; } -static int -__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +/** + * gfs2_setattr - Change attributes on an inode + * @dentry: The dentry which is changing + * @attr: The structure describing the change + * + * The VFS layer wants to change one or more of an inodes attributes. Write + * that change out to disk. + * + * Returns: errno + */ + +static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) { - struct buffer_head *dibh; + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; int error; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (!error) { - error = inode_setattr(&ip->i_inode, attr); - gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); + error = gfs2_rs_alloc(ip); + if (error) + return error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + return error; + + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out; + + error = inode_change_ok(inode, attr); + if (error) + goto out; + + if (attr->ia_valid & ATTR_SIZE) + error = gfs2_setattr_size(inode, attr->ia_size); + else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) + error = setattr_chown(inode, attr); + else { + error = gfs2_setattr_simple(inode, attr); + if (!error && attr->ia_valid & ATTR_MODE) + error = posix_acl_chmod(inode, inode->i_mode); } + +out: + if (!error) + mark_inode_dirty(inode); + gfs2_glock_dq_uninit(&i_gh); return error; } /** - * gfs2_setattr_simple - - * @ip: - * @attr: + * gfs2_getattr - Read out an inode's attributes + * @mnt: The vfsmount the inode is being accessed from + * @dentry: The dentry to stat + * @stat: The inode's stats * - * Called with a reference on the vnode. + * This may be called from the VFS directly, or from within GFS2 with the + * inode locked, so we look to see if the glock is already locked and only + * lock the glock if its not already been done. Note that its the NFS + * readdirplus operation which causes this to be called (from filldir) + * with the glock already held. * * Returns: errno */ -int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) { + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + struct gfs2_sbd *sdp = GFS2_SB(inode); int error; + int unlock = 0; + int frozen_root = 0; + + if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { + if (unlikely(gfs2_glock_is_held_excl(sdp->sd_freeze_gl) && + inode == sdp->sd_root_dir->d_inode && + atomic_inc_not_zero(&sdp->sd_frozen_root))) + frozen_root = 1; + else { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) + return error; + unlock = 1; + } + } - if (current->journal_info) - return __gfs2_setattr_simple(ip, attr); + generic_fillattr(inode, stat); + if (unlock) + gfs2_glock_dq_uninit(&gh); + else if (frozen_root && atomic_dec_and_test(&sdp->sd_frozen_root)) + wake_up(&sdp->sd_frozen_root_wait); - error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0); - if (error) - return error; + return 0; +} - error = __gfs2_setattr_simple(ip, attr); - gfs2_trans_end(GFS2_SB(&ip->i_inode)); - return error; +static int gfs2_setxattr(struct dentry *dentry, const char *name, + const void *data, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = gfs2_rs_alloc(ip); + if (ret == 0) + ret = generic_setxattr(dentry, name, data, size, flags); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; } -void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) +static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, + void *data, size_t size) { - const struct gfs2_dinode_host *di = &ip->i_di; - struct gfs2_dinode *str = buf; - - str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); - str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI); - str->di_header.__pad0 = 0; - str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); - str->di_header.__pad1 = 0; - str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); - str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); - str->di_mode = cpu_to_be32(ip->i_inode.i_mode); - str->di_uid = cpu_to_be32(ip->i_inode.i_uid); - str->di_gid = cpu_to_be32(ip->i_inode.i_gid); - str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); - str->di_size = cpu_to_be64(di->di_size); - str->di_blocks = cpu_to_be64(di->di_blocks); - str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); - str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); - str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec); - - str->di_goal_meta = cpu_to_be64(di->di_goal_meta); - str->di_goal_data = cpu_to_be64(di->di_goal_data); - str->di_generation = cpu_to_be64(di->di_generation); - - str->di_flags = cpu_to_be32(di->di_flags); - str->di_height = cpu_to_be16(di->di_height); - str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && - !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ? - GFS2_FORMAT_DE : 0); - str->di_depth = cpu_to_be16(di->di_depth); - str->di_entries = cpu_to_be32(di->di_entries); - - str->di_eattr = cpu_to_be64(di->di_eattr); - str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); - str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); - str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + /* For selinux during lookup */ + if (gfs2_glock_is_locked_by_me(ip->i_gl)) + return generic_getxattr(dentry, name, data, size); + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_getxattr(dentry, name, data, size); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; } -void gfs2_dinode_print(const struct gfs2_inode *ip) +static int gfs2_removexattr(struct dentry *dentry, const char *name) { - const struct gfs2_dinode_host *di = &ip->i_di; - - printk(KERN_INFO " no_formal_ino = %llu\n", - (unsigned long long)ip->i_no_formal_ino); - printk(KERN_INFO " no_addr = %llu\n", - (unsigned long long)ip->i_no_addr); - printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size); - printk(KERN_INFO " di_blocks = %llu\n", - (unsigned long long)di->di_blocks); - printk(KERN_INFO " di_goal_meta = %llu\n", - (unsigned long long)di->di_goal_meta); - printk(KERN_INFO " di_goal_data = %llu\n", - (unsigned long long)di->di_goal_data); - printk(KERN_INFO " di_flags = 0x%.8X\n", di->di_flags); - printk(KERN_INFO " di_height = %u\n", di->di_height); - printk(KERN_INFO " di_depth = %u\n", di->di_depth); - printk(KERN_INFO " di_entries = %u\n", di->di_entries); - printk(KERN_INFO " di_eattr = %llu\n", - (unsigned long long)di->di_eattr); + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = gfs2_rs_alloc(ip); + if (ret == 0) + ret = generic_removexattr(dentry, name); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; +} + +static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) + goto out; + + if (gfs2_is_stuffed(ip)) { + u64 phys = ip->i_no_addr << inode->i_blkbits; + u64 size = i_size_read(inode); + u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED| + FIEMAP_EXTENT_DATA_INLINE; + phys += sizeof(struct gfs2_dinode); + phys += start; + if (start + len > size) + len = size - start; + if (start < size) + ret = fiemap_fill_next_extent(fieinfo, start, phys, + len, flags); + if (ret == 1) + ret = 0; + } else { + ret = __generic_block_fiemap(inode, fieinfo, start, len, + gfs2_block_map); + } + + gfs2_glock_dq_uninit(&gh); +out: + mutex_unlock(&inode->i_mutex); + return ret; } +const struct inode_operations gfs2_file_iops = { + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, + .get_acl = gfs2_get_acl, + .set_acl = gfs2_set_acl, +}; + +const struct inode_operations gfs2_dir_iops = { + .create = gfs2_create, + .lookup = gfs2_lookup, + .link = gfs2_link, + .unlink = gfs2_unlink, + .symlink = gfs2_symlink, + .mkdir = gfs2_mkdir, + .rmdir = gfs2_unlink, + .mknod = gfs2_mknod, + .rename = gfs2_rename, + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, + .get_acl = gfs2_get_acl, + .set_acl = gfs2_set_acl, + .atomic_open = gfs2_atomic_open, +}; + +const struct inode_operations gfs2_symlink_iops = { + .readlink = generic_readlink, + .follow_link = gfs2_follow_link, + .put_link = kfree_put_link, + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, +}; + diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index d4465066261..ba4d9492d42 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -10,14 +10,24 @@ #ifndef __INODE_DOT_H__ #define __INODE_DOT_H__ +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/mm.h> +#include "util.h" + +extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); +extern int gfs2_internal_read(struct gfs2_inode *ip, + char *buf, loff_t *pos, unsigned size); +extern void gfs2_set_aops(struct inode *inode); + static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) { - return !ip->i_di.di_height; + return !ip->i_height; } static inline int gfs2_is_jdata(const struct gfs2_inode *ip) { - return ip->i_di.di_flags & GFS2_DIF_JDATA; + return ip->i_diskflags & GFS2_DIF_JDATA; } static inline int gfs2_is_writeback(const struct gfs2_inode *ip) @@ -37,13 +47,25 @@ static inline int gfs2_is_dir(const struct gfs2_inode *ip) return S_ISDIR(ip->i_inode.i_mode); } -static inline void gfs2_set_inode_blocks(struct inode *inode) +static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks) +{ + inode->i_blocks = blocks << + (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); +} + +static inline u64 gfs2_get_inode_blocks(const struct inode *inode) { - struct gfs2_inode *ip = GFS2_I(inode); - inode->i_blocks = ip->i_di.di_blocks << + return inode->i_blocks >> (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); } +static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change) +{ + gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change)); + change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK); + inode->i_blocks += change; +} + static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr, u64 no_formal_ino) { @@ -57,33 +79,63 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); } +static inline int gfs2_check_internal_file_size(struct inode *inode, + u64 minsize, u64 maxsize) +{ + u64 size = i_size_read(inode); + if (size < minsize || size > maxsize) + goto err; + if (size & ((1 << inode->i_blkbits) - 1)) + goto err; + return 0; +err: + gfs2_consist_inode(GFS2_I(inode)); + return -EIO; +} -void gfs2_inode_attr_in(struct gfs2_inode *ip); -void gfs2_set_iop(struct inode *inode); -struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, - u64 no_addr, u64 no_formal_ino, - int skip_freeing); -struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); - -int gfs2_inode_refresh(struct gfs2_inode *ip); - -int gfs2_dinode_dealloc(struct gfs2_inode *inode); -int gfs2_change_nlink(struct gfs2_inode *ip, int diff); -struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, - int is_root, struct nameidata *nd); -struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, - unsigned int mode, dev_t dev); -int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip); -int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, - const struct gfs2_inode *ip); -int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to); -int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); -int gfs2_glock_nq_atime(struct gfs2_holder *gh); -int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); -struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); -void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); -void gfs2_dinode_print(const struct gfs2_inode *ip); +extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, + u64 no_addr, u64 no_formal_ino, + int non_block); +extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 *no_formal_ino, + unsigned int blktype); +extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int nonblock); + +extern int gfs2_inode_refresh(struct gfs2_inode *ip); + +extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root); +extern int gfs2_permission(struct inode *inode, int mask); +extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); +extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); +extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); +extern int gfs2_open_common(struct inode *inode, struct file *file); + +extern const struct inode_operations gfs2_file_iops; +extern const struct inode_operations gfs2_dir_iops; +extern const struct inode_operations gfs2_symlink_iops; +extern const struct file_operations gfs2_file_fops_nolock; +extern const struct file_operations gfs2_dir_fops_nolock; + +extern void gfs2_set_inode_flags(struct inode *inode); + +#ifdef CONFIG_GFS2_FS_LOCKING_DLM +extern const struct file_operations gfs2_file_fops; +extern const struct file_operations gfs2_dir_fops; + +static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) +{ + return sdp->sd_args.ar_localflocks; +} +#else /* Single node only */ +#define gfs2_file_fops gfs2_file_fops_nolock +#define gfs2_dir_fops gfs2_dir_fops_nolock + +static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) +{ + return 1; +} +#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ #endif /* __INODE_DOT_H__ */ diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c deleted file mode 100644 index cfcc39b86a5..00000000000 --- a/fs/gfs2/lm.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/completion.h> -#include <linux/buffer_head.h> -#include <linux/delay.h> -#include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> - -#include "gfs2.h" -#include "incore.h" -#include "glock.h" -#include "lm.h" -#include "super.h" -#include "util.h" - -/** - * gfs2_lm_mount - mount a locking protocol - * @sdp: the filesystem - * @args: mount arguements - * @silent: if 1, don't complain if the FS isn't a GFS2 fs - * - * Returns: errno - */ - -int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) -{ - char *proto = sdp->sd_proto_name; - char *table = sdp->sd_table_name; - int flags = 0; - int error; - - if (sdp->sd_args.ar_spectator) - flags |= LM_MFLAG_SPECTATOR; - - fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table); - - error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata, - gfs2_glock_cb, sdp, - GFS2_MIN_LVB_SIZE, flags, - &sdp->sd_lockstruct, &sdp->sd_kobj); - if (error) { - fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n", - proto, table, sdp->sd_args.ar_hostdata); - goto out; - } - - if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) || - gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) || - gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >= - GFS2_MIN_LVB_SIZE)) { - gfs2_unmount_lockproto(&sdp->sd_lockstruct); - goto out; - } - - if (sdp->sd_args.ar_spectator) - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table); - else - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table, - sdp->sd_lockstruct.ls_jid); - - fs_info(sdp, "Joined cluster. Now mounting FS...\n"); - - if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) && - !sdp->sd_args.ar_ignore_local_fs) { - sdp->sd_args.ar_localflocks = 1; - sdp->sd_args.ar_localcaching = 1; - } - -out: - return error; -} - -void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) -{ - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - sdp->sd_lockstruct.ls_ops->lm_others_may_mount( - sdp->sd_lockstruct.ls_lockspace); -} - -void gfs2_lm_unmount(struct gfs2_sbd *sdp) -{ - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - gfs2_unmount_lockproto(&sdp->sd_lockstruct); -} - -int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) -{ - va_list args; - - if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return 0; - - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); - - fs_err(sdp, "about to withdraw this file system\n"); - BUG_ON(sdp->sd_args.ar_debug); - - fs_err(sdp, "telling LM to withdraw\n"); - gfs2_withdraw_lockproto(&sdp->sd_lockstruct); - fs_err(sdp, "withdrawn\n"); - dump_stack(); - - return -1; -} - -int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, - void **lockp) -{ - int error = -EIO; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_get_lock( - sdp->sd_lockstruct.ls_lockspace, name, lockp); - return error; -} - -void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock) -{ - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - sdp->sd_lockstruct.ls_ops->lm_put_lock(lock); -} - -unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, - unsigned int cur_state, unsigned int req_state, - unsigned int flags) -{ - int ret = 0; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state, - req_state, flags); - return ret; -} - -unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock, - unsigned int cur_state) -{ - int ret = 0; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state); - return ret; -} - -void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock) -{ - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - sdp->sd_lockstruct.ls_ops->lm_cancel(lock); -} - -int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp) -{ - int error = -EIO; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp); - return error; -} - -void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb) -{ - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb); -} - -int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - int error = -EIO; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_plock_get( - sdp->sd_lockstruct.ls_lockspace, name, file, fl); - return error; -} - -int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, int cmd, struct file_lock *fl) -{ - int error = -EIO; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_plock( - sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl); - return error; -} - -int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - int error = -EIO; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - error = sdp->sd_lockstruct.ls_ops->lm_punlock( - sdp->sd_lockstruct.ls_lockspace, name, file, fl); - return error; -} - -void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, - unsigned int message) -{ - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - sdp->sd_lockstruct.ls_ops->lm_recovery_done( - sdp->sd_lockstruct.ls_lockspace, jid, message); -} - diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h deleted file mode 100644 index 21cdc30ee08..00000000000 --- a/fs/gfs2/lm.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __LM_DOT_H__ -#define __LM_DOT_H__ - -struct gfs2_sbd; - -#define GFS2_MIN_LVB_SIZE 32 - -int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent); -void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp); -void gfs2_lm_unmount(struct gfs2_sbd *sdp); -int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) - __attribute__ ((format(printf, 2, 3))); -int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, - void **lockp); -void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock); -unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, - unsigned int cur_state, unsigned int req_state, - unsigned int flags); -unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock, - unsigned int cur_state); -void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock); -int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp); -void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb); -int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, struct file_lock *fl); -int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, int cmd, struct file_lock *fl); -int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name, - struct file *file, struct file_lock *fl); -void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, - unsigned int message); - -#endif /* __LM_DOT_H__ */ diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c new file mode 100644 index 00000000000..4fafea1c9ec --- /dev/null +++ b/fs/gfs2/lock_dlm.c @@ -0,0 +1,1342 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright 2004-2011 Red Hat, Inc. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/fs.h> +#include <linux/dlm.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/delay.h> +#include <linux/gfs2_ondisk.h> + +#include "incore.h" +#include "glock.h" +#include "util.h" +#include "sys.h" +#include "trace_gfs2.h" + +extern struct workqueue_struct *gfs2_control_wq; + +/** + * gfs2_update_stats - Update time based stats + * @mv: Pointer to mean/variance structure to update + * @sample: New data to include + * + * @delta is the difference between the current rtt sample and the + * running average srtt. We add 1/8 of that to the srtt in order to + * update the current srtt estimate. The varience estimate is a bit + * more complicated. We subtract the abs value of the @delta from + * the current variance estimate and add 1/4 of that to the running + * total. + * + * Note that the index points at the array entry containing the smoothed + * mean value, and the variance is always in the following entry + * + * Reference: TCP/IP Illustrated, vol 2, p. 831,832 + * All times are in units of integer nanoseconds. Unlike the TCP/IP case, + * they are not scaled fixed point. + */ + +static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index, + s64 sample) +{ + s64 delta = sample - s->stats[index]; + s->stats[index] += (delta >> 3); + index++; + s->stats[index] += ((abs64(delta) - s->stats[index]) >> 2); +} + +/** + * gfs2_update_reply_times - Update locking statistics + * @gl: The glock to update + * + * This assumes that gl->gl_dstamp has been set earlier. + * + * The rtt (lock round trip time) is an estimate of the time + * taken to perform a dlm lock request. We update it on each + * reply from the dlm. + * + * The blocking flag is set on the glock for all dlm requests + * which may potentially block due to lock requests from other nodes. + * DLM requests where the current lock state is exclusive, the + * requested state is null (or unlocked) or where the TRY or + * TRY_1CB flags are set are classified as non-blocking. All + * other DLM requests are counted as (potentially) blocking. + */ +static inline void gfs2_update_reply_times(struct gfs2_glock *gl) +{ + struct gfs2_pcpu_lkstats *lks; + const unsigned gltype = gl->gl_name.ln_type; + unsigned index = test_bit(GLF_BLOCKING, &gl->gl_flags) ? + GFS2_LKS_SRTTB : GFS2_LKS_SRTT; + s64 rtt; + + preempt_disable(); + rtt = ktime_to_ns(ktime_sub(ktime_get_real(), gl->gl_dstamp)); + lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats); + gfs2_update_stats(&gl->gl_stats, index, rtt); /* Local */ + gfs2_update_stats(&lks->lkstats[gltype], index, rtt); /* Global */ + preempt_enable(); + + trace_gfs2_glock_lock_time(gl, rtt); +} + +/** + * gfs2_update_request_times - Update locking statistics + * @gl: The glock to update + * + * The irt (lock inter-request times) measures the average time + * between requests to the dlm. It is updated immediately before + * each dlm call. + */ + +static inline void gfs2_update_request_times(struct gfs2_glock *gl) +{ + struct gfs2_pcpu_lkstats *lks; + const unsigned gltype = gl->gl_name.ln_type; + ktime_t dstamp; + s64 irt; + + preempt_disable(); + dstamp = gl->gl_dstamp; + gl->gl_dstamp = ktime_get_real(); + irt = ktime_to_ns(ktime_sub(gl->gl_dstamp, dstamp)); + lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats); + gfs2_update_stats(&gl->gl_stats, GFS2_LKS_SIRT, irt); /* Local */ + gfs2_update_stats(&lks->lkstats[gltype], GFS2_LKS_SIRT, irt); /* Global */ + preempt_enable(); +} + +static void gdlm_ast(void *arg) +{ + struct gfs2_glock *gl = arg; + unsigned ret = gl->gl_state; + + gfs2_update_reply_times(gl); + BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); + + if ((gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) && gl->gl_lksb.sb_lvbptr) + memset(gl->gl_lksb.sb_lvbptr, 0, GDLM_LVB_SIZE); + + switch (gl->gl_lksb.sb_status) { + case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ + gfs2_glock_free(gl); + return; + case -DLM_ECANCEL: /* Cancel while getting lock */ + ret |= LM_OUT_CANCELED; + goto out; + case -EAGAIN: /* Try lock fails */ + case -EDEADLK: /* Deadlock detected */ + goto out; + case -ETIMEDOUT: /* Canceled due to timeout */ + ret |= LM_OUT_ERROR; + goto out; + case 0: /* Success */ + break; + default: /* Something unexpected */ + BUG(); + } + + ret = gl->gl_req; + if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) { + if (gl->gl_req == LM_ST_SHARED) + ret = LM_ST_DEFERRED; + else if (gl->gl_req == LM_ST_DEFERRED) + ret = LM_ST_SHARED; + else + BUG(); + } + + set_bit(GLF_INITIAL, &gl->gl_flags); + gfs2_glock_complete(gl, ret); + return; +out: + if (!test_bit(GLF_INITIAL, &gl->gl_flags)) + gl->gl_lksb.sb_lkid = 0; + gfs2_glock_complete(gl, ret); +} + +static void gdlm_bast(void *arg, int mode) +{ + struct gfs2_glock *gl = arg; + + switch (mode) { + case DLM_LOCK_EX: + gfs2_glock_cb(gl, LM_ST_UNLOCKED); + break; + case DLM_LOCK_CW: + gfs2_glock_cb(gl, LM_ST_DEFERRED); + break; + case DLM_LOCK_PR: + gfs2_glock_cb(gl, LM_ST_SHARED); + break; + default: + pr_err("unknown bast mode %d\n", mode); + BUG(); + } +} + +/* convert gfs lock-state to dlm lock-mode */ + +static int make_mode(const unsigned int lmstate) +{ + switch (lmstate) { + case LM_ST_UNLOCKED: + return DLM_LOCK_NL; + case LM_ST_EXCLUSIVE: + return DLM_LOCK_EX; + case LM_ST_DEFERRED: + return DLM_LOCK_CW; + case LM_ST_SHARED: + return DLM_LOCK_PR; + } + pr_err("unknown LM state %d\n", lmstate); + BUG(); + return -1; +} + +static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, + const int req) +{ + u32 lkf = 0; + + if (gl->gl_lksb.sb_lvbptr) + lkf |= DLM_LKF_VALBLK; + + if (gfs_flags & LM_FLAG_TRY) + lkf |= DLM_LKF_NOQUEUE; + + if (gfs_flags & LM_FLAG_TRY_1CB) { + lkf |= DLM_LKF_NOQUEUE; + lkf |= DLM_LKF_NOQUEUEBAST; + } + + if (gfs_flags & LM_FLAG_PRIORITY) { + lkf |= DLM_LKF_NOORDER; + lkf |= DLM_LKF_HEADQUE; + } + + if (gfs_flags & LM_FLAG_ANY) { + if (req == DLM_LOCK_PR) + lkf |= DLM_LKF_ALTCW; + else if (req == DLM_LOCK_CW) + lkf |= DLM_LKF_ALTPR; + else + BUG(); + } + + if (gl->gl_lksb.sb_lkid != 0) { + lkf |= DLM_LKF_CONVERT; + if (test_bit(GLF_BLOCKING, &gl->gl_flags)) + lkf |= DLM_LKF_QUECVT; + } + + return lkf; +} + +static void gfs2_reverse_hex(char *c, u64 value) +{ + *c = '0'; + while (value) { + *c-- = hex_asc[value & 0x0f]; + value >>= 4; + } +} + +static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags) +{ + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + int req; + u32 lkf; + char strname[GDLM_STRNAME_BYTES] = ""; + + req = make_mode(req_state); + lkf = make_flags(gl, flags, req); + gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); + gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); + if (gl->gl_lksb.sb_lkid) { + gfs2_update_request_times(gl); + } else { + memset(strname, ' ', GDLM_STRNAME_BYTES - 1); + strname[GDLM_STRNAME_BYTES - 1] = '\0'; + gfs2_reverse_hex(strname + 7, gl->gl_name.ln_type); + gfs2_reverse_hex(strname + 23, gl->gl_name.ln_number); + gl->gl_dstamp = ktime_get_real(); + } + /* + * Submit the actual lock request. + */ + + return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, + GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); +} + +static void gdlm_put_lock(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int lvb_needs_unlock = 0; + int error; + + if (gl->gl_lksb.sb_lkid == 0) { + gfs2_glock_free(gl); + return; + } + + clear_bit(GLF_BLOCKING, &gl->gl_flags); + gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); + gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); + gfs2_update_request_times(gl); + + /* don't want to skip dlm_unlock writing the lvb when lock is ex */ + + if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE)) + lvb_needs_unlock = 1; + + if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && + !lvb_needs_unlock) { + gfs2_glock_free(gl); + return; + } + + error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, + NULL, gl); + if (error) { + pr_err("gdlm_unlock %x,%llx err=%d\n", + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, error); + return; + } +} + +static void gdlm_cancel(struct gfs2_glock *gl) +{ + struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); +} + +/* + * dlm/gfs2 recovery coordination using dlm_recover callbacks + * + * 1. dlm_controld sees lockspace members change + * 2. dlm_controld blocks dlm-kernel locking activity + * 3. dlm_controld within dlm-kernel notifies gfs2 (recover_prep) + * 4. dlm_controld starts and finishes its own user level recovery + * 5. dlm_controld starts dlm-kernel dlm_recoverd to do kernel recovery + * 6. dlm_recoverd notifies gfs2 of failed nodes (recover_slot) + * 7. dlm_recoverd does its own lock recovery + * 8. dlm_recoverd unblocks dlm-kernel locking activity + * 9. dlm_recoverd notifies gfs2 when done (recover_done with new generation) + * 10. gfs2_control updates control_lock lvb with new generation and jid bits + * 11. gfs2_control enqueues journals for gfs2_recover to recover (maybe none) + * 12. gfs2_recover dequeues and recovers journals of failed nodes + * 13. gfs2_recover provides recovery results to gfs2_control (recovery_result) + * 14. gfs2_control updates control_lock lvb jid bits for recovered journals + * 15. gfs2_control unblocks normal locking when all journals are recovered + * + * - failures during recovery + * + * recover_prep() may set BLOCK_LOCKS (step 3) again before gfs2_control + * clears BLOCK_LOCKS (step 15), e.g. another node fails while still + * recovering for a prior failure. gfs2_control needs a way to detect + * this so it can leave BLOCK_LOCKS set in step 15. This is managed using + * the recover_block and recover_start values. + * + * recover_done() provides a new lockspace generation number each time it + * is called (step 9). This generation number is saved as recover_start. + * When recover_prep() is called, it sets BLOCK_LOCKS and sets + * recover_block = recover_start. So, while recover_block is equal to + * recover_start, BLOCK_LOCKS should remain set. (recover_spin must + * be held around the BLOCK_LOCKS/recover_block/recover_start logic.) + * + * - more specific gfs2 steps in sequence above + * + * 3. recover_prep sets BLOCK_LOCKS and sets recover_block = recover_start + * 6. recover_slot records any failed jids (maybe none) + * 9. recover_done sets recover_start = new generation number + * 10. gfs2_control sets control_lock lvb = new gen + bits for failed jids + * 12. gfs2_recover does journal recoveries for failed jids identified above + * 14. gfs2_control clears control_lock lvb bits for recovered jids + * 15. gfs2_control checks if recover_block == recover_start (step 3 occured + * again) then do nothing, otherwise if recover_start > recover_block + * then clear BLOCK_LOCKS. + * + * - parallel recovery steps across all nodes + * + * All nodes attempt to update the control_lock lvb with the new generation + * number and jid bits, but only the first to get the control_lock EX will + * do so; others will see that it's already done (lvb already contains new + * generation number.) + * + * . All nodes get the same recover_prep/recover_slot/recover_done callbacks + * . All nodes attempt to set control_lock lvb gen + bits for the new gen + * . One node gets control_lock first and writes the lvb, others see it's done + * . All nodes attempt to recover jids for which they see control_lock bits set + * . One node succeeds for a jid, and that one clears the jid bit in the lvb + * . All nodes will eventually see all lvb bits clear and unblock locks + * + * - is there a problem with clearing an lvb bit that should be set + * and missing a journal recovery? + * + * 1. jid fails + * 2. lvb bit set for step 1 + * 3. jid recovered for step 1 + * 4. jid taken again (new mount) + * 5. jid fails (for step 4) + * 6. lvb bit set for step 5 (will already be set) + * 7. lvb bit cleared for step 3 + * + * This is not a problem because the failure in step 5 does not + * require recovery, because the mount in step 4 could not have + * progressed far enough to unblock locks and access the fs. The + * control_mount() function waits for all recoveries to be complete + * for the latest lockspace generation before ever unblocking locks + * and returning. The mount in step 4 waits until the recovery in + * step 1 is done. + * + * - special case of first mounter: first node to mount the fs + * + * The first node to mount a gfs2 fs needs to check all the journals + * and recover any that need recovery before other nodes are allowed + * to mount the fs. (Others may begin mounting, but they must wait + * for the first mounter to be done before taking locks on the fs + * or accessing the fs.) This has two parts: + * + * 1. The mounted_lock tells a node it's the first to mount the fs. + * Each node holds the mounted_lock in PR while it's mounted. + * Each node tries to acquire the mounted_lock in EX when it mounts. + * If a node is granted the mounted_lock EX it means there are no + * other mounted nodes (no PR locks exist), and it is the first mounter. + * The mounted_lock is demoted to PR when first recovery is done, so + * others will fail to get an EX lock, but will get a PR lock. + * + * 2. The control_lock blocks others in control_mount() while the first + * mounter is doing first mount recovery of all journals. + * A mounting node needs to acquire control_lock in EX mode before + * it can proceed. The first mounter holds control_lock in EX while doing + * the first mount recovery, blocking mounts from other nodes, then demotes + * control_lock to NL when it's done (others_may_mount/first_done), + * allowing other nodes to continue mounting. + * + * first mounter: + * control_lock EX/NOQUEUE success + * mounted_lock EX/NOQUEUE success (no other PR, so no other mounters) + * set first=1 + * do first mounter recovery + * mounted_lock EX->PR + * control_lock EX->NL, write lvb generation + * + * other mounter: + * control_lock EX/NOQUEUE success (if fail -EAGAIN, retry) + * mounted_lock EX/NOQUEUE fail -EAGAIN (expected due to other mounters PR) + * mounted_lock PR/NOQUEUE success + * read lvb generation + * control_lock EX->NL + * set first=0 + * + * - mount during recovery + * + * If a node mounts while others are doing recovery (not first mounter), + * the mounting node will get its initial recover_done() callback without + * having seen any previous failures/callbacks. + * + * It must wait for all recoveries preceding its mount to be finished + * before it unblocks locks. It does this by repeating the "other mounter" + * steps above until the lvb generation number is >= its mount generation + * number (from initial recover_done) and all lvb bits are clear. + * + * - control_lock lvb format + * + * 4 bytes generation number: the latest dlm lockspace generation number + * from recover_done callback. Indicates the jid bitmap has been updated + * to reflect all slot failures through that generation. + * 4 bytes unused. + * GDLM_LVB_SIZE-8 bytes of jid bit map. If bit N is set, it indicates + * that jid N needs recovery. + */ + +#define JID_BITMAP_OFFSET 8 /* 4 byte generation number + 4 byte unused */ + +static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen, + char *lvb_bits) +{ + __le32 gen; + memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE); + memcpy(&gen, lvb_bits, sizeof(__le32)); + *lvb_gen = le32_to_cpu(gen); +} + +static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen, + char *lvb_bits) +{ + __le32 gen; + memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE); + gen = cpu_to_le32(lvb_gen); + memcpy(ls->ls_control_lvb, &gen, sizeof(__le32)); +} + +static int all_jid_bits_clear(char *lvb) +{ + return !memchr_inv(lvb + JID_BITMAP_OFFSET, 0, + GDLM_LVB_SIZE - JID_BITMAP_OFFSET); +} + +static void sync_wait_cb(void *arg) +{ + struct lm_lockstruct *ls = arg; + complete(&ls->ls_sync_wait); +} + +static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int error; + + error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls); + if (error) { + fs_err(sdp, "%s lkid %x error %d\n", + name, lksb->sb_lkid, error); + return error; + } + + wait_for_completion(&ls->ls_sync_wait); + + if (lksb->sb_status != -DLM_EUNLOCK) { + fs_err(sdp, "%s lkid %x status %d\n", + name, lksb->sb_lkid, lksb->sb_status); + return -1; + } + return 0; +} + +static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags, + unsigned int num, struct dlm_lksb *lksb, char *name) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char strname[GDLM_STRNAME_BYTES]; + int error, status; + + memset(strname, 0, GDLM_STRNAME_BYTES); + snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num); + + error = dlm_lock(ls->ls_dlm, mode, lksb, flags, + strname, GDLM_STRNAME_BYTES - 1, + 0, sync_wait_cb, ls, NULL); + if (error) { + fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n", + name, lksb->sb_lkid, flags, mode, error); + return error; + } + + wait_for_completion(&ls->ls_sync_wait); + + status = lksb->sb_status; + + if (status && status != -EAGAIN) { + fs_err(sdp, "%s lkid %x flags %x mode %d status %d\n", + name, lksb->sb_lkid, flags, mode, status); + } + + return status; +} + +static int mounted_unlock(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_unlock(sdp, &ls->ls_mounted_lksb, "mounted_lock"); +} + +static int mounted_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_lock(sdp, mode, flags, GFS2_MOUNTED_LOCK, + &ls->ls_mounted_lksb, "mounted_lock"); +} + +static int control_unlock(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_unlock(sdp, &ls->ls_control_lksb, "control_lock"); +} + +static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_lock(sdp, mode, flags, GFS2_CONTROL_LOCK, + &ls->ls_control_lksb, "control_lock"); +} + +static void gfs2_control_func(struct work_struct *work) +{ + struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work); + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + uint32_t block_gen, start_gen, lvb_gen, flags; + int recover_set = 0; + int write_lvb = 0; + int recover_size; + int i, error; + + spin_lock(&ls->ls_recover_spin); + /* + * No MOUNT_DONE means we're still mounting; control_mount() + * will set this flag, after which this thread will take over + * all further clearing of BLOCK_LOCKS. + * + * FIRST_MOUNT means this node is doing first mounter recovery, + * for which recovery control is handled by + * control_mount()/control_first_done(), not this thread. + */ + if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + block_gen = ls->ls_recover_block; + start_gen = ls->ls_recover_start; + spin_unlock(&ls->ls_recover_spin); + + /* + * Equal block_gen and start_gen implies we are between + * recover_prep and recover_done callbacks, which means + * dlm recovery is in progress and dlm locking is blocked. + * There's no point trying to do any work until recover_done. + */ + + if (block_gen == start_gen) + return; + + /* + * Propagate recover_submit[] and recover_result[] to lvb: + * dlm_recoverd adds to recover_submit[] jids needing recovery + * gfs2_recover adds to recover_result[] journal recovery results + * + * set lvb bit for jids in recover_submit[] if the lvb has not + * yet been updated for the generation of the failure + * + * clear lvb bit for jids in recover_result[] if the result of + * the journal recovery is SUCCESS + */ + + error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + if (error) { + fs_err(sdp, "control lock EX error %d\n", error); + return; + } + + control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits); + + spin_lock(&ls->ls_recover_spin); + if (block_gen != ls->ls_recover_block || + start_gen != ls->ls_recover_start) { + fs_info(sdp, "recover generation %u block1 %u %u\n", + start_gen, block_gen, ls->ls_recover_block); + spin_unlock(&ls->ls_recover_spin); + control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + return; + } + + recover_size = ls->ls_recover_size; + + if (lvb_gen <= start_gen) { + /* + * Clear lvb bits for jids we've successfully recovered. + * Because all nodes attempt to recover failed journals, + * a journal can be recovered multiple times successfully + * in succession. Only the first will really do recovery, + * the others find it clean, but still report a successful + * recovery. So, another node may have already recovered + * the jid and cleared the lvb bit for it. + */ + for (i = 0; i < recover_size; i++) { + if (ls->ls_recover_result[i] != LM_RD_SUCCESS) + continue; + + ls->ls_recover_result[i] = 0; + + if (!test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET)) + continue; + + __clear_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET); + write_lvb = 1; + } + } + + if (lvb_gen == start_gen) { + /* + * Failed slots before start_gen are already set in lvb. + */ + for (i = 0; i < recover_size; i++) { + if (!ls->ls_recover_submit[i]) + continue; + if (ls->ls_recover_submit[i] < lvb_gen) + ls->ls_recover_submit[i] = 0; + } + } else if (lvb_gen < start_gen) { + /* + * Failed slots before start_gen are not yet set in lvb. + */ + for (i = 0; i < recover_size; i++) { + if (!ls->ls_recover_submit[i]) + continue; + if (ls->ls_recover_submit[i] < start_gen) { + ls->ls_recover_submit[i] = 0; + __set_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET); + } + } + /* even if there are no bits to set, we need to write the + latest generation to the lvb */ + write_lvb = 1; + } else { + /* + * we should be getting a recover_done() for lvb_gen soon + */ + } + spin_unlock(&ls->ls_recover_spin); + + if (write_lvb) { + control_lvb_write(ls, start_gen, ls->ls_lvb_bits); + flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK; + } else { + flags = DLM_LKF_CONVERT; + } + + error = control_lock(sdp, DLM_LOCK_NL, flags); + if (error) { + fs_err(sdp, "control lock NL error %d\n", error); + return; + } + + /* + * Everyone will see jid bits set in the lvb, run gfs2_recover_set(), + * and clear a jid bit in the lvb if the recovery is a success. + * Eventually all journals will be recovered, all jid bits will + * be cleared in the lvb, and everyone will clear BLOCK_LOCKS. + */ + + for (i = 0; i < recover_size; i++) { + if (test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET)) { + fs_info(sdp, "recover generation %u jid %d\n", + start_gen, i); + gfs2_recover_set(sdp, i); + recover_set++; + } + } + if (recover_set) + return; + + /* + * No more jid bits set in lvb, all recovery is done, unblock locks + * (unless a new recover_prep callback has occured blocking locks + * again while working above) + */ + + spin_lock(&ls->ls_recover_spin); + if (ls->ls_recover_block == block_gen && + ls->ls_recover_start == start_gen) { + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "recover generation %u done\n", start_gen); + gfs2_glock_thaw(sdp); + } else { + fs_info(sdp, "recover generation %u block2 %u %u\n", + start_gen, block_gen, ls->ls_recover_block); + spin_unlock(&ls->ls_recover_spin); + } +} + +static int control_mount(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + uint32_t start_gen, block_gen, mount_gen, lvb_gen; + int mounted_mode; + int retries = 0; + int error; + + memset(&ls->ls_mounted_lksb, 0, sizeof(struct dlm_lksb)); + memset(&ls->ls_control_lksb, 0, sizeof(struct dlm_lksb)); + memset(&ls->ls_control_lvb, 0, GDLM_LVB_SIZE); + ls->ls_control_lksb.sb_lvbptr = ls->ls_control_lvb; + init_completion(&ls->ls_sync_wait); + + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_VALBLK); + if (error) { + fs_err(sdp, "control_mount control_lock NL error %d\n", error); + return error; + } + + error = mounted_lock(sdp, DLM_LOCK_NL, 0); + if (error) { + fs_err(sdp, "control_mount mounted_lock NL error %d\n", error); + control_unlock(sdp); + return error; + } + mounted_mode = DLM_LOCK_NL; + +restart: + if (retries++ && signal_pending(current)) { + error = -EINTR; + goto fail; + } + + /* + * We always start with both locks in NL. control_lock is + * demoted to NL below so we don't need to do it here. + */ + + if (mounted_mode != DLM_LOCK_NL) { + error = mounted_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + if (error) + goto fail; + mounted_mode = DLM_LOCK_NL; + } + + /* + * Other nodes need to do some work in dlm recovery and gfs2_control + * before the recover_done and control_lock will be ready for us below. + * A delay here is not required but often avoids having to retry. + */ + + msleep_interruptible(500); + + /* + * Acquire control_lock in EX and mounted_lock in either EX or PR. + * control_lock lvb keeps track of any pending journal recoveries. + * mounted_lock indicates if any other nodes have the fs mounted. + */ + + error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE|DLM_LKF_VALBLK); + if (error == -EAGAIN) { + goto restart; + } else if (error) { + fs_err(sdp, "control_mount control_lock EX error %d\n", error); + goto fail; + } + + error = mounted_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE); + if (!error) { + mounted_mode = DLM_LOCK_EX; + goto locks_done; + } else if (error != -EAGAIN) { + fs_err(sdp, "control_mount mounted_lock EX error %d\n", error); + goto fail; + } + + error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE); + if (!error) { + mounted_mode = DLM_LOCK_PR; + goto locks_done; + } else { + /* not even -EAGAIN should happen here */ + fs_err(sdp, "control_mount mounted_lock PR error %d\n", error); + goto fail; + } + +locks_done: + /* + * If we got both locks above in EX, then we're the first mounter. + * If not, then we need to wait for the control_lock lvb to be + * updated by other mounted nodes to reflect our mount generation. + * + * In simple first mounter cases, first mounter will see zero lvb_gen, + * but in cases where all existing nodes leave/fail before mounting + * nodes finish control_mount, then all nodes will be mounting and + * lvb_gen will be non-zero. + */ + + control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits); + + if (lvb_gen == 0xFFFFFFFF) { + /* special value to force mount attempts to fail */ + fs_err(sdp, "control_mount control_lock disabled\n"); + error = -EINVAL; + goto fail; + } + + if (mounted_mode == DLM_LOCK_EX) { + /* first mounter, keep both EX while doing first recovery */ + spin_lock(&ls->ls_recover_spin); + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags); + set_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "first mounter control generation %u\n", lvb_gen); + return 0; + } + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + if (error) + goto fail; + + /* + * We are not first mounter, now we need to wait for the control_lock + * lvb generation to be >= the generation from our first recover_done + * and all lvb bits to be clear (no pending journal recoveries.) + */ + + if (!all_jid_bits_clear(ls->ls_lvb_bits)) { + /* journals need recovery, wait until all are clear */ + fs_info(sdp, "control_mount wait for journal recovery\n"); + goto restart; + } + + spin_lock(&ls->ls_recover_spin); + block_gen = ls->ls_recover_block; + start_gen = ls->ls_recover_start; + mount_gen = ls->ls_recover_mount; + + if (lvb_gen < mount_gen) { + /* wait for mounted nodes to update control_lock lvb to our + generation, which might include new recovery bits set */ + fs_info(sdp, "control_mount wait1 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; + } + + if (lvb_gen != start_gen) { + /* wait for mounted nodes to update control_lock lvb to the + latest recovery generation */ + fs_info(sdp, "control_mount wait2 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; + } + + if (block_gen == start_gen) { + /* dlm recovery in progress, wait for it to finish */ + fs_info(sdp, "control_mount wait3 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; + } + + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags); + memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t)); + memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t)); + spin_unlock(&ls->ls_recover_spin); + return 0; + +fail: + mounted_unlock(sdp); + control_unlock(sdp); + return error; +} + +static int dlm_recovery_wait(void *word) +{ + schedule(); + return 0; +} + +static int control_first_done(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + uint32_t start_gen, block_gen; + int error; + +restart: + spin_lock(&ls->ls_recover_spin); + start_gen = ls->ls_recover_start; + block_gen = ls->ls_recover_block; + + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags) || + !test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + !test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + /* sanity check, should not happen */ + fs_err(sdp, "control_first_done start %u block %u flags %lx\n", + start_gen, block_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + control_unlock(sdp); + return -1; + } + + if (start_gen == block_gen) { + /* + * Wait for the end of a dlm recovery cycle to switch from + * first mounter recovery. We can ignore any recover_slot + * callbacks between the recover_prep and next recover_done + * because we are still the first mounter and any failed nodes + * have not fully mounted, so they don't need recovery. + */ + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "control_first_done wait gen %u\n", start_gen); + + wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY, + dlm_recovery_wait, TASK_UNINTERRUPTIBLE); + goto restart; + } + + clear_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + set_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags); + memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t)); + memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t)); + spin_unlock(&ls->ls_recover_spin); + + memset(ls->ls_lvb_bits, 0, GDLM_LVB_SIZE); + control_lvb_write(ls, start_gen, ls->ls_lvb_bits); + + error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT); + if (error) + fs_err(sdp, "control_first_done mounted PR error %d\n", error); + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + if (error) + fs_err(sdp, "control_first_done control NL error %d\n", error); + + return error; +} + +/* + * Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC) + * to accomodate the largest slot number. (NB dlm slot numbers start at 1, + * gfs2 jids start at 0, so jid = slot - 1) + */ + +#define RECOVER_SIZE_INC 16 + +static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots, + int num_slots) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + uint32_t *submit = NULL; + uint32_t *result = NULL; + uint32_t old_size, new_size; + int i, max_jid; + + if (!ls->ls_lvb_bits) { + ls->ls_lvb_bits = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); + if (!ls->ls_lvb_bits) + return -ENOMEM; + } + + max_jid = 0; + for (i = 0; i < num_slots; i++) { + if (max_jid < slots[i].slot - 1) + max_jid = slots[i].slot - 1; + } + + old_size = ls->ls_recover_size; + + if (old_size >= max_jid + 1) + return 0; + + new_size = old_size + RECOVER_SIZE_INC; + + submit = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS); + result = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS); + if (!submit || !result) { + kfree(submit); + kfree(result); + return -ENOMEM; + } + + spin_lock(&ls->ls_recover_spin); + memcpy(submit, ls->ls_recover_submit, old_size * sizeof(uint32_t)); + memcpy(result, ls->ls_recover_result, old_size * sizeof(uint32_t)); + kfree(ls->ls_recover_submit); + kfree(ls->ls_recover_result); + ls->ls_recover_submit = submit; + ls->ls_recover_result = result; + ls->ls_recover_size = new_size; + spin_unlock(&ls->ls_recover_spin); + return 0; +} + +static void free_recover_size(struct lm_lockstruct *ls) +{ + kfree(ls->ls_lvb_bits); + kfree(ls->ls_recover_submit); + kfree(ls->ls_recover_result); + ls->ls_recover_submit = NULL; + ls->ls_recover_result = NULL; + ls->ls_recover_size = 0; +} + +/* dlm calls before it does lock recovery */ + +static void gdlm_recover_prep(void *arg) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + spin_lock(&ls->ls_recover_spin); + ls->ls_recover_block = ls->ls_recover_start; + set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); + + if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); +} + +/* dlm calls after recover_prep has been completed on all lockspace members; + identifies slot/jid of failed member */ + +static void gdlm_recover_slot(void *arg, struct dlm_slot *slot) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int jid = slot->slot - 1; + + spin_lock(&ls->ls_recover_spin); + if (ls->ls_recover_size < jid + 1) { + fs_err(sdp, "recover_slot jid %d gen %u short size %d", + jid, ls->ls_recover_block, ls->ls_recover_size); + spin_unlock(&ls->ls_recover_spin); + return; + } + + if (ls->ls_recover_submit[jid]) { + fs_info(sdp, "recover_slot jid %d gen %u prev %u\n", + jid, ls->ls_recover_block, ls->ls_recover_submit[jid]); + } + ls->ls_recover_submit[jid] = ls->ls_recover_block; + spin_unlock(&ls->ls_recover_spin); +} + +/* dlm calls after recover_slot and after it completes lock recovery */ + +static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots, + int our_slot, uint32_t generation) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + /* ensure the ls jid arrays are large enough */ + set_recover_size(sdp, slots, num_slots); + + spin_lock(&ls->ls_recover_spin); + ls->ls_recover_start = generation; + + if (!ls->ls_recover_mount) { + ls->ls_recover_mount = generation; + ls->ls_jid = our_slot - 1; + } + + if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) + queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0); + + clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); + smp_mb__after_atomic(); + wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY); + spin_unlock(&ls->ls_recover_spin); +} + +/* gfs2_recover thread has a journal recovery result */ + +static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid, + unsigned int result) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + return; + + /* don't care about the recovery of own journal during mount */ + if (jid == ls->ls_jid) + return; + + spin_lock(&ls->ls_recover_spin); + if (test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + if (ls->ls_recover_size < jid + 1) { + fs_err(sdp, "recovery_result jid %d short size %d", + jid, ls->ls_recover_size); + spin_unlock(&ls->ls_recover_spin); + return; + } + + fs_info(sdp, "recover jid %d result %s\n", jid, + result == LM_RD_GAVEUP ? "busy" : "success"); + + ls->ls_recover_result[jid] = result; + + /* GAVEUP means another node is recovering the journal; delay our + next attempt to recover it, to give the other node a chance to + finish before trying again */ + + if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) + queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, + result == LM_RD_GAVEUP ? HZ : 0); + spin_unlock(&ls->ls_recover_spin); +} + +const struct dlm_lockspace_ops gdlm_lockspace_ops = { + .recover_prep = gdlm_recover_prep, + .recover_slot = gdlm_recover_slot, + .recover_done = gdlm_recover_done, +}; + +static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char cluster[GFS2_LOCKNAME_LEN]; + const char *fsname; + uint32_t flags; + int error, ops_result; + + /* + * initialize everything + */ + + INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func); + spin_lock_init(&ls->ls_recover_spin); + ls->ls_recover_flags = 0; + ls->ls_recover_mount = 0; + ls->ls_recover_start = 0; + ls->ls_recover_block = 0; + ls->ls_recover_size = 0; + ls->ls_recover_submit = NULL; + ls->ls_recover_result = NULL; + ls->ls_lvb_bits = NULL; + + error = set_recover_size(sdp, NULL, 0); + if (error) + goto fail; + + /* + * prepare dlm_new_lockspace args + */ + + fsname = strchr(table, ':'); + if (!fsname) { + fs_info(sdp, "no fsname found\n"); + error = -EINVAL; + goto fail_free; + } + memset(cluster, 0, sizeof(cluster)); + memcpy(cluster, table, strlen(table) - strlen(fsname)); + fsname++; + + flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL; + + /* + * create/join lockspace + */ + + error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE, + &gdlm_lockspace_ops, sdp, &ops_result, + &ls->ls_dlm); + if (error) { + fs_err(sdp, "dlm_new_lockspace error %d\n", error); + goto fail_free; + } + + if (ops_result < 0) { + /* + * dlm does not support ops callbacks, + * old dlm_controld/gfs_controld are used, try without ops. + */ + fs_info(sdp, "dlm lockspace ops not used\n"); + free_recover_size(ls); + set_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags); + return 0; + } + + if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) { + fs_err(sdp, "dlm lockspace ops disallow jid preset\n"); + error = -EINVAL; + goto fail_release; + } + + /* + * control_mount() uses control_lock to determine first mounter, + * and for later mounts, waits for any recoveries to be cleared. + */ + + error = control_mount(sdp); + if (error) { + fs_err(sdp, "mount control error %d\n", error); + goto fail_release; + } + + ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); + smp_mb__after_atomic(); + wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); + return 0; + +fail_release: + dlm_release_lockspace(ls->ls_dlm, 2); +fail_free: + free_recover_size(ls); +fail: + return error; +} + +static void gdlm_first_done(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int error; + + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + return; + + error = control_first_done(sdp); + if (error) + fs_err(sdp, "mount first_done error %d\n", error); +} + +static void gdlm_unmount(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + goto release; + + /* wait for gfs2_control_wq to be done with this mount */ + + spin_lock(&ls->ls_recover_spin); + set_bit(DFL_UNMOUNT, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + flush_delayed_work(&sdp->sd_control_work); + + /* mounted_lock and control_lock will be purged in dlm recovery */ +release: + if (ls->ls_dlm) { + dlm_release_lockspace(ls->ls_dlm, 2); + ls->ls_dlm = NULL; + } + + free_recover_size(ls); +} + +static const match_table_t dlm_tokens = { + { Opt_jid, "jid=%d"}, + { Opt_id, "id=%d"}, + { Opt_first, "first=%d"}, + { Opt_nodir, "nodir=%d"}, + { Opt_err, NULL }, +}; + +const struct lm_lockops gfs2_dlm_ops = { + .lm_proto_name = "lock_dlm", + .lm_mount = gdlm_mount, + .lm_first_done = gdlm_first_done, + .lm_recovery_result = gdlm_recovery_result, + .lm_unmount = gdlm_unmount, + .lm_put_lock = gdlm_put_lock, + .lm_lock = gdlm_lock, + .lm_cancel = gdlm_cancel, + .lm_tokens = &dlm_tokens, +}; + diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c deleted file mode 100644 index 663fee72878..00000000000 --- a/fs/gfs2/locking.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/slab.h> -#include <linux/wait.h> -#include <linux/sched.h> -#include <linux/kmod.h> -#include <linux/fs.h> -#include <linux/delay.h> -#include <linux/lm_interface.h> - -struct lmh_wrapper { - struct list_head lw_list; - const struct lm_lockops *lw_ops; -}; - -/* List of registered low-level locking protocols. A file system selects one - of them by name at mount time, e.g. lock_nolock, lock_dlm. */ - -static LIST_HEAD(lmh_list); -static DEFINE_MUTEX(lmh_lock); - -/** - * gfs2_register_lockproto - Register a low-level locking protocol - * @proto: the protocol definition - * - * Returns: 0 on success, -EXXX on failure - */ - -int gfs2_register_lockproto(const struct lm_lockops *proto) -{ - struct lmh_wrapper *lw; - - mutex_lock(&lmh_lock); - - list_for_each_entry(lw, &lmh_list, lw_list) { - if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) { - mutex_unlock(&lmh_lock); - printk(KERN_INFO "GFS2: protocol %s already exists\n", - proto->lm_proto_name); - return -EEXIST; - } - } - - lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL); - if (!lw) { - mutex_unlock(&lmh_lock); - return -ENOMEM; - } - - lw->lw_ops = proto; - list_add(&lw->lw_list, &lmh_list); - - mutex_unlock(&lmh_lock); - - return 0; -} - -/** - * gfs2_unregister_lockproto - Unregister a low-level locking protocol - * @proto: the protocol definition - * - */ - -void gfs2_unregister_lockproto(const struct lm_lockops *proto) -{ - struct lmh_wrapper *lw; - - mutex_lock(&lmh_lock); - - list_for_each_entry(lw, &lmh_list, lw_list) { - if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) { - list_del(&lw->lw_list); - mutex_unlock(&lmh_lock); - kfree(lw); - return; - } - } - - mutex_unlock(&lmh_lock); - - printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n", - proto->lm_proto_name); -} - -/** - * gfs2_mount_lockproto - Mount a lock protocol - * @proto_name - the name of the protocol - * @table_name - the name of the lock space - * @host_data - data specific to this host - * @cb - the callback to the code using the lock module - * @sdp - The GFS2 superblock - * @min_lvb_size - the mininum LVB size that the caller can deal with - * @flags - LM_MFLAG_* - * @lockstruct - a structure returned describing the mount - * - * Returns: 0 on success, -EXXX on failure - */ - -int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data, - lm_callback_t cb, void *cb_data, - unsigned int min_lvb_size, int flags, - struct lm_lockstruct *lockstruct, - struct kobject *fskobj) -{ - struct lmh_wrapper *lw = NULL; - int try = 0; - int error, found; - -retry: - mutex_lock(&lmh_lock); - - found = 0; - list_for_each_entry(lw, &lmh_list, lw_list) { - if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) { - found = 1; - break; - } - } - - if (!found) { - if (!try && capable(CAP_SYS_MODULE)) { - try = 1; - mutex_unlock(&lmh_lock); - request_module(proto_name); - goto retry; - } - printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name); - error = -ENOENT; - goto out; - } - - if (!try_module_get(lw->lw_ops->lm_owner)) { - try = 0; - mutex_unlock(&lmh_lock); - msleep(1000); - goto retry; - } - - error = lw->lw_ops->lm_mount(table_name, host_data, cb, cb_data, - min_lvb_size, flags, lockstruct, fskobj); - if (error) - module_put(lw->lw_ops->lm_owner); -out: - mutex_unlock(&lmh_lock); - return error; -} - -void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct) -{ - mutex_lock(&lmh_lock); - lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace); - if (lockstruct->ls_ops->lm_owner) - module_put(lockstruct->ls_ops->lm_owner); - mutex_unlock(&lmh_lock); -} - -/** - * gfs2_withdraw_lockproto - abnormally unmount a lock module - * @lockstruct: the lockstruct passed into mount - * - */ - -void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct) -{ - mutex_lock(&lmh_lock); - lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace); - if (lockstruct->ls_ops->lm_owner) - module_put(lockstruct->ls_ops->lm_owner); - mutex_unlock(&lmh_lock); -} - -EXPORT_SYMBOL_GPL(gfs2_register_lockproto); -EXPORT_SYMBOL_GPL(gfs2_unregister_lockproto); - diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile deleted file mode 100644 index 89b93b6b45c..00000000000 --- a/fs/gfs2/locking/dlm/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o -lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o - diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c deleted file mode 100644 index 542a797ac89..00000000000 --- a/fs/gfs2/locking/dlm/lock.c +++ /dev/null @@ -1,527 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include "lock_dlm.h" - -static char junk_lvb[GDLM_LVB_SIZE]; - -static void queue_complete(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - - clear_bit(LFL_ACTIVE, &lp->flags); - - spin_lock(&ls->async_lock); - list_add_tail(&lp->clist, &ls->complete); - spin_unlock(&ls->async_lock); - wake_up(&ls->thread_wait); -} - -static inline void gdlm_ast(void *astarg) -{ - queue_complete(astarg); -} - -static inline void gdlm_bast(void *astarg, int mode) -{ - struct gdlm_lock *lp = astarg; - struct gdlm_ls *ls = lp->ls; - - if (!mode) { - printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - return; - } - - spin_lock(&ls->async_lock); - if (!lp->bast_mode) { - list_add_tail(&lp->blist, &ls->blocking); - lp->bast_mode = mode; - } else if (lp->bast_mode < mode) - lp->bast_mode = mode; - spin_unlock(&ls->async_lock); - wake_up(&ls->thread_wait); -} - -void gdlm_queue_delayed(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - - spin_lock(&ls->async_lock); - list_add_tail(&lp->delay_list, &ls->delayed); - spin_unlock(&ls->async_lock); -} - -/* convert gfs lock-state to dlm lock-mode */ - -static s16 make_mode(s16 lmstate) -{ - switch (lmstate) { - case LM_ST_UNLOCKED: - return DLM_LOCK_NL; - case LM_ST_EXCLUSIVE: - return DLM_LOCK_EX; - case LM_ST_DEFERRED: - return DLM_LOCK_CW; - case LM_ST_SHARED: - return DLM_LOCK_PR; - } - gdlm_assert(0, "unknown LM state %d", lmstate); - return -1; -} - -/* convert dlm lock-mode to gfs lock-state */ - -s16 gdlm_make_lmstate(s16 dlmmode) -{ - switch (dlmmode) { - case DLM_LOCK_IV: - case DLM_LOCK_NL: - return LM_ST_UNLOCKED; - case DLM_LOCK_EX: - return LM_ST_EXCLUSIVE; - case DLM_LOCK_CW: - return LM_ST_DEFERRED; - case DLM_LOCK_PR: - return LM_ST_SHARED; - } - gdlm_assert(0, "unknown DLM mode %d", dlmmode); - return -1; -} - -/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and - DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */ - -static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state) -{ - s16 cur = make_mode(cur_state); - if (lp->cur != DLM_LOCK_IV) - gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur); -} - -static inline unsigned int make_flags(struct gdlm_lock *lp, - unsigned int gfs_flags, - s16 cur, s16 req) -{ - unsigned int lkf = 0; - - if (gfs_flags & LM_FLAG_TRY) - lkf |= DLM_LKF_NOQUEUE; - - if (gfs_flags & LM_FLAG_TRY_1CB) { - lkf |= DLM_LKF_NOQUEUE; - lkf |= DLM_LKF_NOQUEUEBAST; - } - - if (gfs_flags & LM_FLAG_PRIORITY) { - lkf |= DLM_LKF_NOORDER; - lkf |= DLM_LKF_HEADQUE; - } - - if (gfs_flags & LM_FLAG_ANY) { - if (req == DLM_LOCK_PR) - lkf |= DLM_LKF_ALTCW; - else if (req == DLM_LOCK_CW) - lkf |= DLM_LKF_ALTPR; - } - - if (lp->lksb.sb_lkid != 0) { - lkf |= DLM_LKF_CONVERT; - - /* Conversion deadlock avoidance by DLM */ - - if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) && - !(lkf & DLM_LKF_NOQUEUE) && - cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req) - lkf |= DLM_LKF_CONVDEADLK; - } - - if (lp->lvb) - lkf |= DLM_LKF_VALBLK; - - return lkf; -} - -/* make_strname - convert GFS lock numbers to a string */ - -static inline void make_strname(const struct lm_lockname *lockname, - struct gdlm_strname *str) -{ - sprintf(str->name, "%8x%16llx", lockname->ln_type, - (unsigned long long)lockname->ln_number); - str->namelen = GDLM_STRNAME_BYTES; -} - -static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name, - struct gdlm_lock **lpp) -{ - struct gdlm_lock *lp; - - lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL); - if (!lp) - return -ENOMEM; - - lp->lockname = *name; - make_strname(name, &lp->strname); - lp->ls = ls; - lp->cur = DLM_LOCK_IV; - lp->lvb = NULL; - lp->hold_null = NULL; - INIT_LIST_HEAD(&lp->clist); - INIT_LIST_HEAD(&lp->blist); - INIT_LIST_HEAD(&lp->delay_list); - - spin_lock(&ls->async_lock); - list_add(&lp->all_list, &ls->all_locks); - ls->all_locks_count++; - spin_unlock(&ls->async_lock); - - *lpp = lp; - return 0; -} - -void gdlm_delete_lp(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - - spin_lock(&ls->async_lock); - if (!list_empty(&lp->clist)) - list_del_init(&lp->clist); - if (!list_empty(&lp->blist)) - list_del_init(&lp->blist); - if (!list_empty(&lp->delay_list)) - list_del_init(&lp->delay_list); - gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - list_del_init(&lp->all_list); - ls->all_locks_count--; - spin_unlock(&ls->async_lock); - - kfree(lp); -} - -int gdlm_get_lock(void *lockspace, struct lm_lockname *name, - void **lockp) -{ - struct gdlm_lock *lp; - int error; - - error = gdlm_create_lp(lockspace, name, &lp); - - *lockp = lp; - return error; -} - -void gdlm_put_lock(void *lock) -{ - gdlm_delete_lp(lock); -} - -unsigned int gdlm_do_lock(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - int error, bast = 1; - - /* - * When recovery is in progress, delay lock requests for submission - * once recovery is done. Requests for recovery (NOEXP) and unlocks - * can pass. - */ - - if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && - !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) { - gdlm_queue_delayed(lp); - return LM_OUT_ASYNC; - } - - /* - * Submit the actual lock request. - */ - - if (test_bit(LFL_NOBAST, &lp->flags)) - bast = 0; - - set_bit(LFL_ACTIVE, &lp->flags); - - log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid, - lp->cur, lp->req, lp->lkf); - - error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf, - lp->strname.name, lp->strname.namelen, 0, gdlm_ast, - lp, bast ? gdlm_bast : NULL); - - if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) { - lp->lksb.sb_status = -EAGAIN; - queue_complete(lp); - error = 0; - } - - if (error) { - log_error("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x " - "flags=%lx", ls->fsname, lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, error, - lp->cur, lp->req, lp->lkf, lp->flags); - return LM_OUT_ERROR; - } - return LM_OUT_ASYNC; -} - -static unsigned int gdlm_do_unlock(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - unsigned int lkf = 0; - int error; - - set_bit(LFL_DLM_UNLOCK, &lp->flags); - set_bit(LFL_ACTIVE, &lp->flags); - - if (lp->lvb) - lkf = DLM_LKF_VALBLK; - - log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->lksb.sb_lkid, lp->cur, lkf); - - error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp); - - if (error) { - log_error("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x " - "flags=%lx", ls->fsname, lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, error, - lp->cur, lp->req, lp->lkf, lp->flags); - return LM_OUT_ERROR; - } - return LM_OUT_ASYNC; -} - -unsigned int gdlm_lock(void *lock, unsigned int cur_state, - unsigned int req_state, unsigned int flags) -{ - struct gdlm_lock *lp = lock; - - clear_bit(LFL_DLM_CANCEL, &lp->flags); - if (flags & LM_FLAG_NOEXP) - set_bit(LFL_NOBLOCK, &lp->flags); - - check_cur_state(lp, cur_state); - lp->req = make_mode(req_state); - lp->lkf = make_flags(lp, flags, lp->cur, lp->req); - - return gdlm_do_lock(lp); -} - -unsigned int gdlm_unlock(void *lock, unsigned int cur_state) -{ - struct gdlm_lock *lp = lock; - - clear_bit(LFL_DLM_CANCEL, &lp->flags); - if (lp->cur == DLM_LOCK_IV) - return 0; - return gdlm_do_unlock(lp); -} - -void gdlm_cancel(void *lock) -{ - struct gdlm_lock *lp = lock; - struct gdlm_ls *ls = lp->ls; - int error, delay_list = 0; - - if (test_bit(LFL_DLM_CANCEL, &lp->flags)) - return; - - log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, lp->flags); - - spin_lock(&ls->async_lock); - if (!list_empty(&lp->delay_list)) { - list_del_init(&lp->delay_list); - delay_list = 1; - } - spin_unlock(&ls->async_lock); - - if (delay_list) { - set_bit(LFL_CANCEL, &lp->flags); - set_bit(LFL_ACTIVE, &lp->flags); - queue_complete(lp); - return; - } - - if (!test_bit(LFL_ACTIVE, &lp->flags) || - test_bit(LFL_DLM_UNLOCK, &lp->flags)) { - log_info("gdlm_cancel skip %x,%llx flags %lx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, lp->flags); - return; - } - - /* the lock is blocked in the dlm */ - - set_bit(LFL_DLM_CANCEL, &lp->flags); - set_bit(LFL_ACTIVE, &lp->flags); - - error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL, - NULL, lp); - - log_info("gdlm_cancel rv %d %x,%llx flags %lx", error, - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, lp->flags); - - if (error == -EBUSY) - clear_bit(LFL_DLM_CANCEL, &lp->flags); -} - -static int gdlm_add_lvb(struct gdlm_lock *lp) -{ - char *lvb; - - lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL); - if (!lvb) - return -ENOMEM; - - lp->lksb.sb_lvbptr = lvb; - lp->lvb = lvb; - return 0; -} - -static void gdlm_del_lvb(struct gdlm_lock *lp) -{ - kfree(lp->lvb); - lp->lvb = NULL; - lp->lksb.sb_lvbptr = NULL; -} - -static int gdlm_ast_wait(void *word) -{ - schedule(); - return 0; -} - -/* This can do a synchronous dlm request (requiring a lock_dlm thread to get - the completion) because gfs won't call hold_lvb() during a callback (from - the context of a lock_dlm thread). */ - -static int hold_null_lock(struct gdlm_lock *lp) -{ - struct gdlm_lock *lpn = NULL; - int error; - - if (lp->hold_null) { - printk(KERN_INFO "lock_dlm: lvb already held\n"); - return 0; - } - - error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn); - if (error) - goto out; - - lpn->lksb.sb_lvbptr = junk_lvb; - lpn->lvb = junk_lvb; - - lpn->req = DLM_LOCK_NL; - lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE; - set_bit(LFL_NOBAST, &lpn->flags); - set_bit(LFL_INLOCK, &lpn->flags); - set_bit(LFL_AST_WAIT, &lpn->flags); - - gdlm_do_lock(lpn); - wait_on_bit(&lpn->flags, LFL_AST_WAIT, gdlm_ast_wait, TASK_UNINTERRUPTIBLE); - error = lpn->lksb.sb_status; - if (error) { - printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n", - error); - gdlm_delete_lp(lpn); - lpn = NULL; - } -out: - lp->hold_null = lpn; - return error; -} - -/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get - the completion) because gfs may call unhold_lvb() during a callback (from - the context of a lock_dlm thread) which could cause a deadlock since the - other lock_dlm thread could be engaged in recovery. */ - -static void unhold_null_lock(struct gdlm_lock *lp) -{ - struct gdlm_lock *lpn = lp->hold_null; - - gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - lpn->lksb.sb_lvbptr = NULL; - lpn->lvb = NULL; - set_bit(LFL_UNLOCK_DELETE, &lpn->flags); - gdlm_do_unlock(lpn); - lp->hold_null = NULL; -} - -/* Acquire a NL lock because gfs requires the value block to remain - intact on the resource while the lvb is "held" even if it's holding no locks - on the resource. */ - -int gdlm_hold_lvb(void *lock, char **lvbp) -{ - struct gdlm_lock *lp = lock; - int error; - - error = gdlm_add_lvb(lp); - if (error) - return error; - - *lvbp = lp->lvb; - - error = hold_null_lock(lp); - if (error) - gdlm_del_lvb(lp); - - return error; -} - -void gdlm_unhold_lvb(void *lock, char *lvb) -{ - struct gdlm_lock *lp = lock; - - unhold_null_lock(lp); - gdlm_del_lvb(lp); -} - -void gdlm_submit_delayed(struct gdlm_ls *ls) -{ - struct gdlm_lock *lp, *safe; - - spin_lock(&ls->async_lock); - list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) { - list_del_init(&lp->delay_list); - list_add_tail(&lp->delay_list, &ls->submit); - } - spin_unlock(&ls->async_lock); - wake_up(&ls->thread_wait); -} - -int gdlm_release_all_locks(struct gdlm_ls *ls) -{ - struct gdlm_lock *lp, *safe; - int count = 0; - - spin_lock(&ls->async_lock); - list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) { - list_del_init(&lp->all_list); - - if (lp->lvb && lp->lvb != junk_lvb) - kfree(lp->lvb); - kfree(lp); - count++; - } - spin_unlock(&ls->async_lock); - - return count; -} - diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h deleted file mode 100644 index 9e8265d2837..00000000000 --- a/fs/gfs2/locking/dlm/lock_dlm.h +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef LOCK_DLM_DOT_H -#define LOCK_DLM_DOT_H - -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/list.h> -#include <linux/socket.h> -#include <linux/delay.h> -#include <linux/kthread.h> -#include <linux/kobject.h> -#include <linux/fcntl.h> -#include <linux/wait.h> -#include <net/sock.h> - -#include <linux/dlm.h> -#include <linux/lm_interface.h> - -/* - * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a - * prefix of lock_dlm_ gets awkward. Externally, GFS refers to this module - * as "lock_dlm". - */ - -#define GDLM_STRNAME_BYTES 24 -#define GDLM_LVB_SIZE 32 -#define GDLM_DROP_COUNT 0 -#define GDLM_DROP_PERIOD 60 -#define GDLM_NAME_LEN 128 - -/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number). - We sprintf these numbers into a 24 byte string of hex values to make them - human-readable (to make debugging simpler.) */ - -struct gdlm_strname { - unsigned char name[GDLM_STRNAME_BYTES]; - unsigned short namelen; -}; - -enum { - DFL_BLOCK_LOCKS = 0, - DFL_SPECTATOR = 1, - DFL_WITHDRAW = 2, -}; - -struct gdlm_ls { - u32 id; - int jid; - int first; - int first_done; - unsigned long flags; - struct kobject kobj; - char clustername[GDLM_NAME_LEN]; - char fsname[GDLM_NAME_LEN]; - int fsflags; - dlm_lockspace_t *dlm_lockspace; - lm_callback_t fscb; - struct gfs2_sbd *sdp; - int recover_jid; - int recover_jid_done; - int recover_jid_status; - spinlock_t async_lock; - struct list_head complete; - struct list_head blocking; - struct list_head delayed; - struct list_head submit; - struct list_head all_locks; - u32 all_locks_count; - wait_queue_head_t wait_control; - struct task_struct *thread1; - struct task_struct *thread2; - wait_queue_head_t thread_wait; - unsigned long drop_time; - int drop_locks_count; - int drop_locks_period; -}; - -enum { - LFL_NOBLOCK = 0, - LFL_NOCACHE = 1, - LFL_DLM_UNLOCK = 2, - LFL_DLM_CANCEL = 3, - LFL_SYNC_LVB = 4, - LFL_FORCE_PROMOTE = 5, - LFL_REREQUEST = 6, - LFL_ACTIVE = 7, - LFL_INLOCK = 8, - LFL_CANCEL = 9, - LFL_NOBAST = 10, - LFL_HEADQUE = 11, - LFL_UNLOCK_DELETE = 12, - LFL_AST_WAIT = 13, -}; - -struct gdlm_lock { - struct gdlm_ls *ls; - struct lm_lockname lockname; - struct gdlm_strname strname; - char *lvb; - struct dlm_lksb lksb; - - s16 cur; - s16 req; - s16 prev_req; - u32 lkf; /* dlm flags DLM_LKF_ */ - unsigned long flags; /* lock_dlm flags LFL_ */ - - int bast_mode; /* protected by async_lock */ - - struct list_head clist; /* complete */ - struct list_head blist; /* blocking */ - struct list_head delay_list; /* delayed */ - struct list_head all_list; /* all locks for the fs */ - struct gdlm_lock *hold_null; /* NL lock for hold_lvb */ -}; - -#define gdlm_assert(assertion, fmt, args...) \ -do { \ - if (unlikely(!(assertion))) { \ - printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \ - "lock_dlm: " fmt "\n", \ - #assertion, ##args); \ - BUG(); \ - } \ -} while (0) - -#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg) -#define log_info(fmt, arg...) log_print(KERN_INFO , fmt , ## arg) -#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg) -#ifdef LOCK_DLM_LOG_DEBUG -#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg) -#else -#define log_debug(fmt, arg...) -#endif - -/* sysfs.c */ - -int gdlm_sysfs_init(void); -void gdlm_sysfs_exit(void); -int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *); -void gdlm_kobject_release(struct gdlm_ls *); - -/* thread.c */ - -int gdlm_init_threads(struct gdlm_ls *); -void gdlm_release_threads(struct gdlm_ls *); - -/* lock.c */ - -s16 gdlm_make_lmstate(s16); -void gdlm_queue_delayed(struct gdlm_lock *); -void gdlm_submit_delayed(struct gdlm_ls *); -int gdlm_release_all_locks(struct gdlm_ls *); -void gdlm_delete_lp(struct gdlm_lock *); -unsigned int gdlm_do_lock(struct gdlm_lock *); - -int gdlm_get_lock(void *, struct lm_lockname *, void **); -void gdlm_put_lock(void *); -unsigned int gdlm_lock(void *, unsigned int, unsigned int, unsigned int); -unsigned int gdlm_unlock(void *, unsigned int); -void gdlm_cancel(void *); -int gdlm_hold_lvb(void *, char **); -void gdlm_unhold_lvb(void *, char *); - -/* plock.c */ - -int gdlm_plock_init(void); -void gdlm_plock_exit(void); -int gdlm_plock(void *, struct lm_lockname *, struct file *, int, - struct file_lock *); -int gdlm_plock_get(void *, struct lm_lockname *, struct file *, - struct file_lock *); -int gdlm_punlock(void *, struct lm_lockname *, struct file *, - struct file_lock *); -#endif - diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c deleted file mode 100644 index a0e7eda643e..00000000000 --- a/fs/gfs2/locking/dlm/main.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/init.h> - -#include "lock_dlm.h" - -extern struct lm_lockops gdlm_ops; - -static int __init init_lock_dlm(void) -{ - int error; - - error = gfs2_register_lockproto(&gdlm_ops); - if (error) { - printk(KERN_WARNING "lock_dlm: can't register protocol: %d\n", - error); - return error; - } - - error = gdlm_sysfs_init(); - if (error) { - gfs2_unregister_lockproto(&gdlm_ops); - return error; - } - - error = gdlm_plock_init(); - if (error) { - gdlm_sysfs_exit(); - gfs2_unregister_lockproto(&gdlm_ops); - return error; - } - - printk(KERN_INFO - "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__); - return 0; -} - -static void __exit exit_lock_dlm(void) -{ - gdlm_plock_exit(); - gdlm_sysfs_exit(); - gfs2_unregister_lockproto(&gdlm_ops); -} - -module_init(init_lock_dlm); -module_exit(exit_lock_dlm); - -MODULE_DESCRIPTION("GFS DLM Locking Module"); -MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); - diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c deleted file mode 100644 index f2efff42422..00000000000 --- a/fs/gfs2/locking/dlm/mount.c +++ /dev/null @@ -1,258 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include "lock_dlm.h" - -const struct lm_lockops gdlm_ops; - - -static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp, - int flags, char *table_name) -{ - struct gdlm_ls *ls; - char buf[256], *p; - - ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL); - if (!ls) - return NULL; - - ls->drop_locks_count = GDLM_DROP_COUNT; - ls->drop_locks_period = GDLM_DROP_PERIOD; - ls->fscb = cb; - ls->sdp = sdp; - ls->fsflags = flags; - spin_lock_init(&ls->async_lock); - INIT_LIST_HEAD(&ls->complete); - INIT_LIST_HEAD(&ls->blocking); - INIT_LIST_HEAD(&ls->delayed); - INIT_LIST_HEAD(&ls->submit); - INIT_LIST_HEAD(&ls->all_locks); - init_waitqueue_head(&ls->thread_wait); - init_waitqueue_head(&ls->wait_control); - ls->thread1 = NULL; - ls->thread2 = NULL; - ls->drop_time = jiffies; - ls->jid = -1; - - strncpy(buf, table_name, 256); - buf[255] = '\0'; - - p = strchr(buf, ':'); - if (!p) { - log_info("invalid table_name \"%s\"", table_name); - kfree(ls); - return NULL; - } - *p = '\0'; - p++; - - strncpy(ls->clustername, buf, GDLM_NAME_LEN); - strncpy(ls->fsname, p, GDLM_NAME_LEN); - - return ls; -} - -static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir) -{ - char data[256]; - char *options, *x, *y; - int error = 0; - - memset(data, 0, 256); - strncpy(data, data_arg, 255); - - if (!strlen(data)) { - log_error("no mount options, (u)mount helpers not installed"); - return -EINVAL; - } - - for (options = data; (x = strsep(&options, ":")); ) { - if (!*x) - continue; - - y = strchr(x, '='); - if (y) - *y++ = 0; - - if (!strcmp(x, "jid")) { - if (!y) { - log_error("need argument to jid"); - error = -EINVAL; - break; - } - sscanf(y, "%u", &ls->jid); - - } else if (!strcmp(x, "first")) { - if (!y) { - log_error("need argument to first"); - error = -EINVAL; - break; - } - sscanf(y, "%u", &ls->first); - - } else if (!strcmp(x, "id")) { - if (!y) { - log_error("need argument to id"); - error = -EINVAL; - break; - } - sscanf(y, "%u", &ls->id); - - } else if (!strcmp(x, "nodir")) { - if (!y) { - log_error("need argument to nodir"); - error = -EINVAL; - break; - } - sscanf(y, "%u", nodir); - - } else { - log_error("unkonwn option: %s", x); - error = -EINVAL; - break; - } - } - - return error; -} - -static int gdlm_mount(char *table_name, char *host_data, - lm_callback_t cb, void *cb_data, - unsigned int min_lvb_size, int flags, - struct lm_lockstruct *lockstruct, - struct kobject *fskobj) -{ - struct gdlm_ls *ls; - int error = -ENOMEM, nodir = 0; - - if (min_lvb_size > GDLM_LVB_SIZE) - goto out; - - ls = init_gdlm(cb, cb_data, flags, table_name); - if (!ls) - goto out; - - error = make_args(ls, host_data, &nodir); - if (error) - goto out; - - error = gdlm_init_threads(ls); - if (error) - goto out_free; - - error = gdlm_kobject_setup(ls, fskobj); - if (error) - goto out_thread; - - error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname), - &ls->dlm_lockspace, - DLM_LSFL_FS | (nodir ? DLM_LSFL_NODIR : 0), - GDLM_LVB_SIZE); - if (error) { - log_error("dlm_new_lockspace error %d", error); - goto out_kobj; - } - - lockstruct->ls_jid = ls->jid; - lockstruct->ls_first = ls->first; - lockstruct->ls_lockspace = ls; - lockstruct->ls_ops = &gdlm_ops; - lockstruct->ls_flags = 0; - lockstruct->ls_lvb_size = GDLM_LVB_SIZE; - return 0; - -out_kobj: - gdlm_kobject_release(ls); -out_thread: - gdlm_release_threads(ls); -out_free: - kfree(ls); -out: - return error; -} - -static void gdlm_unmount(void *lockspace) -{ - struct gdlm_ls *ls = lockspace; - int rv; - - log_debug("unmount flags %lx", ls->flags); - - /* FIXME: serialize unmount and withdraw in case they - happen at once. Also, if unmount follows withdraw, - wait for withdraw to finish. */ - - if (test_bit(DFL_WITHDRAW, &ls->flags)) - goto out; - - gdlm_kobject_release(ls); - dlm_release_lockspace(ls->dlm_lockspace, 2); - gdlm_release_threads(ls); - rv = gdlm_release_all_locks(ls); - if (rv) - log_info("gdlm_unmount: %d stray locks freed", rv); -out: - kfree(ls); -} - -static void gdlm_recovery_done(void *lockspace, unsigned int jid, - unsigned int message) -{ - struct gdlm_ls *ls = lockspace; - ls->recover_jid_done = jid; - ls->recover_jid_status = message; - kobject_uevent(&ls->kobj, KOBJ_CHANGE); -} - -static void gdlm_others_may_mount(void *lockspace) -{ - struct gdlm_ls *ls = lockspace; - ls->first_done = 1; - kobject_uevent(&ls->kobj, KOBJ_CHANGE); -} - -/* Userspace gets the offline uevent, blocks new gfs locks on - other mounters, and lets us know (sets WITHDRAW flag). Then, - userspace leaves the mount group while we leave the lockspace. */ - -static void gdlm_withdraw(void *lockspace) -{ - struct gdlm_ls *ls = lockspace; - - kobject_uevent(&ls->kobj, KOBJ_OFFLINE); - - wait_event_interruptible(ls->wait_control, - test_bit(DFL_WITHDRAW, &ls->flags)); - - dlm_release_lockspace(ls->dlm_lockspace, 2); - gdlm_release_threads(ls); - gdlm_release_all_locks(ls); - gdlm_kobject_release(ls); -} - -const struct lm_lockops gdlm_ops = { - .lm_proto_name = "lock_dlm", - .lm_mount = gdlm_mount, - .lm_others_may_mount = gdlm_others_may_mount, - .lm_unmount = gdlm_unmount, - .lm_withdraw = gdlm_withdraw, - .lm_get_lock = gdlm_get_lock, - .lm_put_lock = gdlm_put_lock, - .lm_lock = gdlm_lock, - .lm_unlock = gdlm_unlock, - .lm_plock = gdlm_plock, - .lm_punlock = gdlm_punlock, - .lm_plock_get = gdlm_plock_get, - .lm_cancel = gdlm_cancel, - .lm_hold_lvb = gdlm_hold_lvb, - .lm_unhold_lvb = gdlm_unhold_lvb, - .lm_recovery_done = gdlm_recovery_done, - .lm_owner = THIS_MODULE, -}; - diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c deleted file mode 100644 index 2ebd374b314..00000000000 --- a/fs/gfs2/locking/dlm/plock.c +++ /dev/null @@ -1,406 +0,0 @@ -/* - * Copyright (C) 2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/miscdevice.h> -#include <linux/lock_dlm_plock.h> -#include <linux/poll.h> - -#include "lock_dlm.h" - - -static spinlock_t ops_lock; -static struct list_head send_list; -static struct list_head recv_list; -static wait_queue_head_t send_wq; -static wait_queue_head_t recv_wq; - -struct plock_op { - struct list_head list; - int done; - struct gdlm_plock_info info; -}; - -struct plock_xop { - struct plock_op xop; - void *callback; - void *fl; - void *file; - struct file_lock flc; -}; - - -static inline void set_version(struct gdlm_plock_info *info) -{ - info->version[0] = GDLM_PLOCK_VERSION_MAJOR; - info->version[1] = GDLM_PLOCK_VERSION_MINOR; - info->version[2] = GDLM_PLOCK_VERSION_PATCH; -} - -static int check_version(struct gdlm_plock_info *info) -{ - if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) || - (GDLM_PLOCK_VERSION_MINOR < info->version[1])) { - log_error("plock device version mismatch: " - "kernel (%u.%u.%u), user (%u.%u.%u)", - GDLM_PLOCK_VERSION_MAJOR, - GDLM_PLOCK_VERSION_MINOR, - GDLM_PLOCK_VERSION_PATCH, - info->version[0], - info->version[1], - info->version[2]); - return -EINVAL; - } - return 0; -} - -static void send_op(struct plock_op *op) -{ - set_version(&op->info); - INIT_LIST_HEAD(&op->list); - spin_lock(&ops_lock); - list_add_tail(&op->list, &send_list); - spin_unlock(&ops_lock); - wake_up(&send_wq); -} - -int gdlm_plock(void *lockspace, struct lm_lockname *name, - struct file *file, int cmd, struct file_lock *fl) -{ - struct gdlm_ls *ls = lockspace; - struct plock_op *op; - struct plock_xop *xop; - int rv; - - xop = kzalloc(sizeof(*xop), GFP_KERNEL); - if (!xop) - return -ENOMEM; - - op = &xop->xop; - op->info.optype = GDLM_PLOCK_OP_LOCK; - op->info.pid = fl->fl_pid; - op->info.ex = (fl->fl_type == F_WRLCK); - op->info.wait = IS_SETLKW(cmd); - op->info.fsid = ls->id; - op->info.number = name->ln_number; - op->info.start = fl->fl_start; - op->info.end = fl->fl_end; - if (fl->fl_lmops && fl->fl_lmops->fl_grant) { - /* fl_owner is lockd which doesn't distinguish - processes on the nfs client */ - op->info.owner = (__u64) fl->fl_pid; - xop->callback = fl->fl_lmops->fl_grant; - locks_init_lock(&xop->flc); - locks_copy_lock(&xop->flc, fl); - xop->fl = fl; - xop->file = file; - } else { - op->info.owner = (__u64)(long) fl->fl_owner; - xop->callback = NULL; - } - - send_op(op); - - if (xop->callback == NULL) - wait_event(recv_wq, (op->done != 0)); - else - return -EINPROGRESS; - - spin_lock(&ops_lock); - if (!list_empty(&op->list)) { - printk(KERN_INFO "plock op on list\n"); - list_del(&op->list); - } - spin_unlock(&ops_lock); - - rv = op->info.rv; - - if (!rv) { - if (posix_lock_file_wait(file, fl) < 0) - log_error("gdlm_plock: vfs lock error %x,%llx", - name->ln_type, - (unsigned long long)name->ln_number); - } - - kfree(xop); - return rv; -} - -/* Returns failure iff a succesful lock operation should be canceled */ -static int gdlm_plock_callback(struct plock_op *op) -{ - struct file *file; - struct file_lock *fl; - struct file_lock *flc; - int (*notify)(void *, void *, int) = NULL; - struct plock_xop *xop = (struct plock_xop *)op; - int rv = 0; - - spin_lock(&ops_lock); - if (!list_empty(&op->list)) { - printk(KERN_INFO "plock op on list\n"); - list_del(&op->list); - } - spin_unlock(&ops_lock); - - /* check if the following 2 are still valid or make a copy */ - file = xop->file; - flc = &xop->flc; - fl = xop->fl; - notify = xop->callback; - - if (op->info.rv) { - notify(flc, NULL, op->info.rv); - goto out; - } - - /* got fs lock; bookkeep locally as well: */ - flc->fl_flags &= ~FL_SLEEP; - if (posix_lock_file(file, flc, NULL)) { - /* - * This can only happen in the case of kmalloc() failure. - * The filesystem's own lock is the authoritative lock, - * so a failure to get the lock locally is not a disaster. - * As long as GFS cannot reliably cancel locks (especially - * in a low-memory situation), we're better off ignoring - * this failure than trying to recover. - */ - log_error("gdlm_plock: vfs lock error file %p fl %p", - file, fl); - } - - rv = notify(flc, NULL, 0); - if (rv) { - /* XXX: We need to cancel the fs lock here: */ - printk("gfs2 lock granted after lock request failed;" - " dangling lock!\n"); - goto out; - } - -out: - kfree(xop); - return rv; -} - -int gdlm_punlock(void *lockspace, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - struct gdlm_ls *ls = lockspace; - struct plock_op *op; - int rv; - - op = kzalloc(sizeof(*op), GFP_KERNEL); - if (!op) - return -ENOMEM; - - if (posix_lock_file_wait(file, fl) < 0) - log_error("gdlm_punlock: vfs unlock error %x,%llx", - name->ln_type, (unsigned long long)name->ln_number); - - op->info.optype = GDLM_PLOCK_OP_UNLOCK; - op->info.pid = fl->fl_pid; - op->info.fsid = ls->id; - op->info.number = name->ln_number; - op->info.start = fl->fl_start; - op->info.end = fl->fl_end; - if (fl->fl_lmops && fl->fl_lmops->fl_grant) - op->info.owner = (__u64) fl->fl_pid; - else - op->info.owner = (__u64)(long) fl->fl_owner; - - send_op(op); - wait_event(recv_wq, (op->done != 0)); - - spin_lock(&ops_lock); - if (!list_empty(&op->list)) { - printk(KERN_INFO "punlock op on list\n"); - list_del(&op->list); - } - spin_unlock(&ops_lock); - - rv = op->info.rv; - - if (rv == -ENOENT) - rv = 0; - - kfree(op); - return rv; -} - -int gdlm_plock_get(void *lockspace, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - struct gdlm_ls *ls = lockspace; - struct plock_op *op; - int rv; - - op = kzalloc(sizeof(*op), GFP_KERNEL); - if (!op) - return -ENOMEM; - - op->info.optype = GDLM_PLOCK_OP_GET; - op->info.pid = fl->fl_pid; - op->info.ex = (fl->fl_type == F_WRLCK); - op->info.fsid = ls->id; - op->info.number = name->ln_number; - op->info.start = fl->fl_start; - op->info.end = fl->fl_end; - if (fl->fl_lmops && fl->fl_lmops->fl_grant) - op->info.owner = (__u64) fl->fl_pid; - else - op->info.owner = (__u64)(long) fl->fl_owner; - - send_op(op); - wait_event(recv_wq, (op->done != 0)); - - spin_lock(&ops_lock); - if (!list_empty(&op->list)) { - printk(KERN_INFO "plock_get op on list\n"); - list_del(&op->list); - } - spin_unlock(&ops_lock); - - /* info.rv from userspace is 1 for conflict, 0 for no-conflict, - -ENOENT if there are no locks on the file */ - - rv = op->info.rv; - - fl->fl_type = F_UNLCK; - if (rv == -ENOENT) - rv = 0; - else if (rv > 0) { - fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK; - fl->fl_pid = op->info.pid; - fl->fl_start = op->info.start; - fl->fl_end = op->info.end; - rv = 0; - } - - kfree(op); - return rv; -} - -/* a read copies out one plock request from the send list */ -static ssize_t dev_read(struct file *file, char __user *u, size_t count, - loff_t *ppos) -{ - struct gdlm_plock_info info; - struct plock_op *op = NULL; - - if (count < sizeof(info)) - return -EINVAL; - - spin_lock(&ops_lock); - if (!list_empty(&send_list)) { - op = list_entry(send_list.next, struct plock_op, list); - list_move(&op->list, &recv_list); - memcpy(&info, &op->info, sizeof(info)); - } - spin_unlock(&ops_lock); - - if (!op) - return -EAGAIN; - - if (copy_to_user(u, &info, sizeof(info))) - return -EFAULT; - return sizeof(info); -} - -/* a write copies in one plock result that should match a plock_op - on the recv list */ -static ssize_t dev_write(struct file *file, const char __user *u, size_t count, - loff_t *ppos) -{ - struct gdlm_plock_info info; - struct plock_op *op; - int found = 0; - - if (count != sizeof(info)) - return -EINVAL; - - if (copy_from_user(&info, u, sizeof(info))) - return -EFAULT; - - if (check_version(&info)) - return -EINVAL; - - spin_lock(&ops_lock); - list_for_each_entry(op, &recv_list, list) { - if (op->info.fsid == info.fsid && op->info.number == info.number && - op->info.owner == info.owner) { - list_del_init(&op->list); - found = 1; - op->done = 1; - memcpy(&op->info, &info, sizeof(info)); - break; - } - } - spin_unlock(&ops_lock); - - if (found) { - struct plock_xop *xop; - xop = (struct plock_xop *)op; - if (xop->callback) - count = gdlm_plock_callback(op); - else - wake_up(&recv_wq); - } else - printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid, - (unsigned long long)info.number); - return count; -} - -static unsigned int dev_poll(struct file *file, poll_table *wait) -{ - unsigned int mask = 0; - - poll_wait(file, &send_wq, wait); - - spin_lock(&ops_lock); - if (!list_empty(&send_list)) - mask = POLLIN | POLLRDNORM; - spin_unlock(&ops_lock); - - return mask; -} - -static const struct file_operations dev_fops = { - .read = dev_read, - .write = dev_write, - .poll = dev_poll, - .owner = THIS_MODULE -}; - -static struct miscdevice plock_dev_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = GDLM_PLOCK_MISC_NAME, - .fops = &dev_fops -}; - -int gdlm_plock_init(void) -{ - int rv; - - spin_lock_init(&ops_lock); - INIT_LIST_HEAD(&send_list); - INIT_LIST_HEAD(&recv_list); - init_waitqueue_head(&send_wq); - init_waitqueue_head(&recv_wq); - - rv = misc_register(&plock_dev_misc); - if (rv) - printk(KERN_INFO "gdlm_plock_init: misc_register failed %d", - rv); - return rv; -} - -void gdlm_plock_exit(void) -{ - if (misc_deregister(&plock_dev_misc) < 0) - printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed"); -} - diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c deleted file mode 100644 index a87b0983976..00000000000 --- a/fs/gfs2/locking/dlm/sysfs.c +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/ctype.h> -#include <linux/stat.h> - -#include "lock_dlm.h" - -extern struct lm_lockops gdlm_ops; - -static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name); -} - -static ssize_t block_show(struct gdlm_ls *ls, char *buf) -{ - ssize_t ret; - int val = 0; - - if (test_bit(DFL_BLOCK_LOCKS, &ls->flags)) - val = 1; - ret = sprintf(buf, "%d\n", val); - return ret; -} - -static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len) -{ - ssize_t ret = len; - int val; - - val = simple_strtol(buf, NULL, 0); - - if (val == 1) - set_bit(DFL_BLOCK_LOCKS, &ls->flags); - else if (val == 0) { - clear_bit(DFL_BLOCK_LOCKS, &ls->flags); - gdlm_submit_delayed(ls); - } else { - ret = -EINVAL; - } - return ret; -} - -static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf) -{ - ssize_t ret; - int val = 0; - - if (test_bit(DFL_WITHDRAW, &ls->flags)) - val = 1; - ret = sprintf(buf, "%d\n", val); - return ret; -} - -static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len) -{ - ssize_t ret = len; - int val; - - val = simple_strtol(buf, NULL, 0); - - if (val == 1) - set_bit(DFL_WITHDRAW, &ls->flags); - else - ret = -EINVAL; - wake_up(&ls->wait_control); - return ret; -} - -static ssize_t id_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%u\n", ls->id); -} - -static ssize_t jid_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->jid); -} - -static ssize_t first_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->first); -} - -static ssize_t first_done_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->first_done); -} - -static ssize_t recover_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->recover_jid); -} - -static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len) -{ - ls->recover_jid = simple_strtol(buf, NULL, 0); - ls->fscb(ls->sdp, LM_CB_NEED_RECOVERY, &ls->recover_jid); - return len; -} - -static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->recover_jid_done); -} - -static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->recover_jid_status); -} - -static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->drop_locks_count); -} - -static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len) -{ - ls->drop_locks_count = simple_strtol(buf, NULL, 0); - return len; -} - -struct gdlm_attr { - struct attribute attr; - ssize_t (*show)(struct gdlm_ls *, char *); - ssize_t (*store)(struct gdlm_ls *, const char *, size_t); -}; - -#define GDLM_ATTR(_name,_mode,_show,_store) \ -static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) - -GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); -GDLM_ATTR(block, 0644, block_show, block_store); -GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); -GDLM_ATTR(id, 0444, id_show, NULL); -GDLM_ATTR(jid, 0444, jid_show, NULL); -GDLM_ATTR(first, 0444, first_show, NULL); -GDLM_ATTR(first_done, 0444, first_done_show, NULL); -GDLM_ATTR(recover, 0644, recover_show, recover_store); -GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); -GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); -GDLM_ATTR(drop_count, 0644, drop_count_show, drop_count_store); - -static struct attribute *gdlm_attrs[] = { - &gdlm_attr_proto_name.attr, - &gdlm_attr_block.attr, - &gdlm_attr_withdraw.attr, - &gdlm_attr_id.attr, - &gdlm_attr_jid.attr, - &gdlm_attr_first.attr, - &gdlm_attr_first_done.attr, - &gdlm_attr_recover.attr, - &gdlm_attr_recover_done.attr, - &gdlm_attr_recover_status.attr, - &gdlm_attr_drop_count.attr, - NULL, -}; - -static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj); - struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr); - return a->show ? a->show(ls, buf) : 0; -} - -static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj); - struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr); - return a->store ? a->store(ls, buf, len) : len; -} - -static struct sysfs_ops gdlm_attr_ops = { - .show = gdlm_attr_show, - .store = gdlm_attr_store, -}; - -static struct kobj_type gdlm_ktype = { - .default_attrs = gdlm_attrs, - .sysfs_ops = &gdlm_attr_ops, -}; - -static struct kset *gdlm_kset; - -int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj) -{ - int error; - - ls->kobj.kset = gdlm_kset; - error = kobject_init_and_add(&ls->kobj, &gdlm_ktype, fskobj, - "lock_module"); - if (error) - log_error("can't register kobj %d", error); - kobject_uevent(&ls->kobj, KOBJ_ADD); - - return error; -} - -void gdlm_kobject_release(struct gdlm_ls *ls) -{ - kobject_put(&ls->kobj); -} - -int gdlm_sysfs_init(void) -{ - gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj); - if (!gdlm_kset) { - printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__); - return -ENOMEM; - } - return 0; -} - -void gdlm_sysfs_exit(void) -{ - kset_unregister(gdlm_kset); -} - diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c deleted file mode 100644 index 521694fc19d..00000000000 --- a/fs/gfs2/locking/dlm/thread.c +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include "lock_dlm.h" - -/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm - thread gets to it. */ - -static void queue_submit(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - - spin_lock(&ls->async_lock); - list_add_tail(&lp->delay_list, &ls->submit); - spin_unlock(&ls->async_lock); - wake_up(&ls->thread_wait); -} - -static void process_blocking(struct gdlm_lock *lp, int bast_mode) -{ - struct gdlm_ls *ls = lp->ls; - unsigned int cb = 0; - - switch (gdlm_make_lmstate(bast_mode)) { - case LM_ST_EXCLUSIVE: - cb = LM_CB_NEED_E; - break; - case LM_ST_DEFERRED: - cb = LM_CB_NEED_D; - break; - case LM_ST_SHARED: - cb = LM_CB_NEED_S; - break; - default: - gdlm_assert(0, "unknown bast mode %u", lp->bast_mode); - } - - ls->fscb(ls->sdp, cb, &lp->lockname); -} - -static void wake_up_ast(struct gdlm_lock *lp) -{ - clear_bit(LFL_AST_WAIT, &lp->flags); - smp_mb__after_clear_bit(); - wake_up_bit(&lp->flags, LFL_AST_WAIT); -} - -static void process_complete(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - struct lm_async_cb acb; - s16 prev_mode = lp->cur; - - memset(&acb, 0, sizeof(acb)); - - if (lp->lksb.sb_status == -DLM_ECANCEL) { - log_info("complete dlm cancel %x,%llx flags %lx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->flags); - - lp->req = lp->cur; - acb.lc_ret |= LM_OUT_CANCELED; - if (lp->cur == DLM_LOCK_IV) - lp->lksb.sb_lkid = 0; - goto out; - } - - if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) { - if (lp->lksb.sb_status != -DLM_EUNLOCK) { - log_info("unlock sb_status %d %x,%llx flags %lx", - lp->lksb.sb_status, lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->flags); - return; - } - - lp->cur = DLM_LOCK_IV; - lp->req = DLM_LOCK_IV; - lp->lksb.sb_lkid = 0; - - if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) { - gdlm_delete_lp(lp); - return; - } - goto out; - } - - if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID) - memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE); - - if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) { - if (lp->req == DLM_LOCK_PR) - lp->req = DLM_LOCK_CW; - else if (lp->req == DLM_LOCK_CW) - lp->req = DLM_LOCK_PR; - } - - /* - * A canceled lock request. The lock was just taken off the delayed - * list and was never even submitted to dlm. - */ - - if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) { - log_info("complete internal cancel %x,%llx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - lp->req = lp->cur; - acb.lc_ret |= LM_OUT_CANCELED; - goto out; - } - - /* - * An error occured. - */ - - if (lp->lksb.sb_status) { - /* a "normal" error */ - if ((lp->lksb.sb_status == -EAGAIN) && - (lp->lkf & DLM_LKF_NOQUEUE)) { - lp->req = lp->cur; - if (lp->cur == DLM_LOCK_IV) - lp->lksb.sb_lkid = 0; - goto out; - } - - /* this could only happen with cancels I think */ - log_info("ast sb_status %d %x,%llx flags %lx", - lp->lksb.sb_status, lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->flags); - return; - } - - /* - * This is an AST for an EX->EX conversion for sync_lvb from GFS. - */ - - if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) { - wake_up_ast(lp); - return; - } - - /* - * A lock has been demoted to NL because it initially completed during - * BLOCK_LOCKS. Now it must be requested in the originally requested - * mode. - */ - - if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) { - gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - - lp->cur = DLM_LOCK_NL; - lp->req = lp->prev_req; - lp->prev_req = DLM_LOCK_IV; - lp->lkf &= ~DLM_LKF_CONVDEADLK; - - set_bit(LFL_NOCACHE, &lp->flags); - - if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && - !test_bit(LFL_NOBLOCK, &lp->flags)) - gdlm_queue_delayed(lp); - else - queue_submit(lp); - return; - } - - /* - * A request is granted during dlm recovery. It may be granted - * because the locks of a failed node were cleared. In that case, - * there may be inconsistent data beneath this lock and we must wait - * for recovery to complete to use it. When gfs recovery is done this - * granted lock will be converted to NL and then reacquired in this - * granted state. - */ - - if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && - !test_bit(LFL_NOBLOCK, &lp->flags) && - lp->req != DLM_LOCK_NL) { - - lp->cur = lp->req; - lp->prev_req = lp->req; - lp->req = DLM_LOCK_NL; - lp->lkf |= DLM_LKF_CONVERT; - lp->lkf &= ~DLM_LKF_CONVDEADLK; - - log_debug("rereq %x,%llx id %x %d,%d", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->lksb.sb_lkid, lp->cur, lp->req); - - set_bit(LFL_REREQUEST, &lp->flags); - queue_submit(lp); - return; - } - - /* - * DLM demoted the lock to NL before it was granted so GFS must be - * told it cannot cache data for this lock. - */ - - if (lp->lksb.sb_flags & DLM_SBF_DEMOTED) - set_bit(LFL_NOCACHE, &lp->flags); - -out: - /* - * This is an internal lock_dlm lock - */ - - if (test_bit(LFL_INLOCK, &lp->flags)) { - clear_bit(LFL_NOBLOCK, &lp->flags); - lp->cur = lp->req; - wake_up_ast(lp); - return; - } - - /* - * Normal completion of a lock request. Tell GFS it now has the lock. - */ - - clear_bit(LFL_NOBLOCK, &lp->flags); - lp->cur = lp->req; - - acb.lc_name = lp->lockname; - acb.lc_ret |= gdlm_make_lmstate(lp->cur); - - if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) && - (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL)) - acb.lc_ret |= LM_OUT_CACHEABLE; - - ls->fscb(ls->sdp, LM_CB_ASYNC, &acb); -} - -static inline int no_work(struct gdlm_ls *ls, int blocking) -{ - int ret; - - spin_lock(&ls->async_lock); - ret = list_empty(&ls->complete) && list_empty(&ls->submit); - if (ret && blocking) - ret = list_empty(&ls->blocking); - spin_unlock(&ls->async_lock); - - return ret; -} - -static inline int check_drop(struct gdlm_ls *ls) -{ - if (!ls->drop_locks_count) - return 0; - - if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) { - ls->drop_time = jiffies; - if (ls->all_locks_count >= ls->drop_locks_count) - return 1; - } - return 0; -} - -static int gdlm_thread(void *data, int blist) -{ - struct gdlm_ls *ls = (struct gdlm_ls *) data; - struct gdlm_lock *lp = NULL; - uint8_t complete, blocking, submit, drop; - - /* Only thread1 is allowed to do blocking callbacks since gfs - may wait for a completion callback within a blocking cb. */ - - while (!kthread_should_stop()) { - wait_event_interruptible(ls->thread_wait, - !no_work(ls, blist) || kthread_should_stop()); - - complete = blocking = submit = drop = 0; - - spin_lock(&ls->async_lock); - - if (blist && !list_empty(&ls->blocking)) { - lp = list_entry(ls->blocking.next, struct gdlm_lock, - blist); - list_del_init(&lp->blist); - blocking = lp->bast_mode; - lp->bast_mode = 0; - } else if (!list_empty(&ls->complete)) { - lp = list_entry(ls->complete.next, struct gdlm_lock, - clist); - list_del_init(&lp->clist); - complete = 1; - } else if (!list_empty(&ls->submit)) { - lp = list_entry(ls->submit.next, struct gdlm_lock, - delay_list); - list_del_init(&lp->delay_list); - submit = 1; - } - - drop = check_drop(ls); - spin_unlock(&ls->async_lock); - - if (complete) - process_complete(lp); - - else if (blocking) - process_blocking(lp, blocking); - - else if (submit) - gdlm_do_lock(lp); - - if (drop) - ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL); - - schedule(); - } - - return 0; -} - -static int gdlm_thread1(void *data) -{ - return gdlm_thread(data, 1); -} - -static int gdlm_thread2(void *data) -{ - return gdlm_thread(data, 0); -} - -int gdlm_init_threads(struct gdlm_ls *ls) -{ - struct task_struct *p; - int error; - - p = kthread_run(gdlm_thread1, ls, "lock_dlm1"); - error = IS_ERR(p); - if (error) { - log_error("can't start lock_dlm1 thread %d", error); - return error; - } - ls->thread1 = p; - - p = kthread_run(gdlm_thread2, ls, "lock_dlm2"); - error = IS_ERR(p); - if (error) { - log_error("can't start lock_dlm2 thread %d", error); - kthread_stop(ls->thread1); - return error; - } - ls->thread2 = p; - - return 0; -} - -void gdlm_release_threads(struct gdlm_ls *ls) -{ - kthread_stop(ls->thread1); - kthread_stop(ls->thread2); -} - diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile deleted file mode 100644 index 35e9730bc3a..00000000000 --- a/fs/gfs2/locking/nolock/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o -lock_nolock-y := main.o - diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c deleted file mode 100644 index d3b8ce6fbbe..00000000000 --- a/fs/gfs2/locking/nolock/main.c +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/fs.h> -#include <linux/lm_interface.h> - -struct nolock_lockspace { - unsigned int nl_lvb_size; -}; - -static const struct lm_lockops nolock_ops; - -static int nolock_mount(char *table_name, char *host_data, - lm_callback_t cb, void *cb_data, - unsigned int min_lvb_size, int flags, - struct lm_lockstruct *lockstruct, - struct kobject *fskobj) -{ - char *c; - unsigned int jid; - struct nolock_lockspace *nl; - - c = strstr(host_data, "jid="); - if (!c) - jid = 0; - else { - c += 4; - sscanf(c, "%u", &jid); - } - - nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL); - if (!nl) - return -ENOMEM; - - nl->nl_lvb_size = min_lvb_size; - - lockstruct->ls_jid = jid; - lockstruct->ls_first = 1; - lockstruct->ls_lvb_size = min_lvb_size; - lockstruct->ls_lockspace = nl; - lockstruct->ls_ops = &nolock_ops; - lockstruct->ls_flags = LM_LSFLAG_LOCAL; - - return 0; -} - -static void nolock_others_may_mount(void *lockspace) -{ -} - -static void nolock_unmount(void *lockspace) -{ - struct nolock_lockspace *nl = lockspace; - kfree(nl); -} - -static void nolock_withdraw(void *lockspace) -{ -} - -/** - * nolock_get_lock - get a lm_lock_t given a descripton of the lock - * @lockspace: the lockspace the lock lives in - * @name: the name of the lock - * @lockp: return the lm_lock_t here - * - * Returns: 0 on success, -EXXX on failure - */ - -static int nolock_get_lock(void *lockspace, struct lm_lockname *name, - void **lockp) -{ - *lockp = lockspace; - return 0; -} - -/** - * nolock_put_lock - get rid of a lock structure - * @lock: the lock to throw away - * - */ - -static void nolock_put_lock(void *lock) -{ -} - -/** - * nolock_lock - acquire a lock - * @lock: the lock to manipulate - * @cur_state: the current state - * @req_state: the requested state - * @flags: modifier flags - * - * Returns: A bitmap of LM_OUT_* - */ - -static unsigned int nolock_lock(void *lock, unsigned int cur_state, - unsigned int req_state, unsigned int flags) -{ - return req_state | LM_OUT_CACHEABLE; -} - -/** - * nolock_unlock - unlock a lock - * @lock: the lock to manipulate - * @cur_state: the current state - * - * Returns: 0 - */ - -static unsigned int nolock_unlock(void *lock, unsigned int cur_state) -{ - return 0; -} - -static void nolock_cancel(void *lock) -{ -} - -/** - * nolock_hold_lvb - hold on to a lock value block - * @lock: the lock the LVB is associated with - * @lvbp: return the lm_lvb_t here - * - * Returns: 0 on success, -EXXX on failure - */ - -static int nolock_hold_lvb(void *lock, char **lvbp) -{ - struct nolock_lockspace *nl = lock; - int error = 0; - - *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL); - if (!*lvbp) - error = -ENOMEM; - - return error; -} - -/** - * nolock_unhold_lvb - release a LVB - * @lock: the lock the LVB is associated with - * @lvb: the lock value block - * - */ - -static void nolock_unhold_lvb(void *lock, char *lvb) -{ - kfree(lvb); -} - -static int nolock_plock_get(void *lockspace, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - posix_test_lock(file, fl); - - return 0; -} - -static int nolock_plock(void *lockspace, struct lm_lockname *name, - struct file *file, int cmd, struct file_lock *fl) -{ - int error; - error = posix_lock_file_wait(file, fl); - return error; -} - -static int nolock_punlock(void *lockspace, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - int error; - error = posix_lock_file_wait(file, fl); - return error; -} - -static void nolock_recovery_done(void *lockspace, unsigned int jid, - unsigned int message) -{ -} - -static const struct lm_lockops nolock_ops = { - .lm_proto_name = "lock_nolock", - .lm_mount = nolock_mount, - .lm_others_may_mount = nolock_others_may_mount, - .lm_unmount = nolock_unmount, - .lm_withdraw = nolock_withdraw, - .lm_get_lock = nolock_get_lock, - .lm_put_lock = nolock_put_lock, - .lm_lock = nolock_lock, - .lm_unlock = nolock_unlock, - .lm_cancel = nolock_cancel, - .lm_hold_lvb = nolock_hold_lvb, - .lm_unhold_lvb = nolock_unhold_lvb, - .lm_plock_get = nolock_plock_get, - .lm_plock = nolock_plock, - .lm_punlock = nolock_punlock, - .lm_recovery_done = nolock_recovery_done, - .lm_owner = THIS_MODULE, -}; - -static int __init init_nolock(void) -{ - int error; - - error = gfs2_register_lockproto(&nolock_ops); - if (error) { - printk(KERN_WARNING - "lock_nolock: can't register protocol: %d\n", error); - return error; - } - - printk(KERN_INFO - "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__); - return 0; -} - -static void __exit exit_nolock(void) -{ - gfs2_unregister_lockproto(&nolock_ops); -} - -module_init(init_nolock); -module_exit(exit_nolock); - -MODULE_DESCRIPTION("GFS Nolock Locking Module"); -MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); - diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 161ab6f2058..3966fadbceb 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -14,10 +14,13 @@ #include <linux/buffer_head.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> -#include <linux/lm_interface.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/writeback.h> +#include <linux/list_sort.h> #include "gfs2.h" #include "incore.h" @@ -28,8 +31,7 @@ #include "meta_io.h" #include "util.h" #include "dir.h" - -#define PULL 1 +#include "trace_gfs2.h" /** * gfs2_struct2blk - compute stuff @@ -66,13 +68,13 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, * @mapping: The associated mapping (maybe NULL) * @bd: The gfs2_bufdata to remove * - * The log lock _must_ be held when calling this function + * The ail lock _must_ be held when calling this function * */ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) { - bd->bd_ail = NULL; + bd->bd_tr = NULL; list_del_init(&bd->bd_ail_st_list); list_del_init(&bd->bd_ail_gl_list); atomic_dec(&bd->bd_gl->gl_ail_count); @@ -82,53 +84,100 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) /** * gfs2_ail1_start_one - Start I/O on a part of the AIL * @sdp: the filesystem - * @tr: the part of the AIL + * @wbc: The writeback control structure + * @ai: The ail structure * */ -static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static int gfs2_ail1_start_one(struct gfs2_sbd *sdp, + struct writeback_control *wbc, + struct gfs2_trans *tr) +__releases(&sdp->sd_ail_lock) +__acquires(&sdp->sd_ail_lock) { + struct gfs2_glock *gl = NULL; + struct address_space *mapping; struct gfs2_bufdata *bd, *s; struct buffer_head *bh; - int retry; - do { - retry = 0; + list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) { + bh = bd->bd_bh; - list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, - bd_ail_st_list) { - bh = bd->bd_bh; + gfs2_assert(sdp, bd->bd_tr == tr); - gfs2_assert(sdp, bd->bd_ail == ai); + if (!buffer_busy(bh)) { + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); + continue; + } - if (!buffer_busy(bh)) { - if (!buffer_uptodate(bh)) - gfs2_io_error_bh(sdp, bh); - list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); - continue; - } + if (!buffer_dirty(bh)) + continue; + if (gl == bd->bd_gl) + continue; + gl = bd->bd_gl; + list_move(&bd->bd_ail_st_list, &tr->tr_ail1_list); + mapping = bh->b_page->mapping; + if (!mapping) + continue; + spin_unlock(&sdp->sd_ail_lock); + generic_writepages(mapping, wbc); + spin_lock(&sdp->sd_ail_lock); + if (wbc->nr_to_write <= 0) + break; + return 1; + } - if (!buffer_dirty(bh)) - continue; + return 0; +} - list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); - get_bh(bh); - gfs2_log_unlock(sdp); - lock_buffer(bh); - if (test_clear_buffer_dirty(bh)) { - bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, bh); - } else { - unlock_buffer(bh); - brelse(bh); - } - gfs2_log_lock(sdp); +/** + * gfs2_ail1_flush - start writeback of some ail1 entries + * @sdp: The super block + * @wbc: The writeback control structure + * + * Writes back some ail1 entries, according to the limits in the + * writeback control structure + */ - retry = 1; +void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) +{ + struct list_head *head = &sdp->sd_ail1_list; + struct gfs2_trans *tr; + struct blk_plug plug; + + trace_gfs2_ail_flush(sdp, wbc, 1); + blk_start_plug(&plug); + spin_lock(&sdp->sd_ail_lock); +restart: + list_for_each_entry_reverse(tr, head, tr_list) { + if (wbc->nr_to_write <= 0) break; - } - } while (retry); + if (gfs2_ail1_start_one(sdp, wbc, tr)) + goto restart; + } + spin_unlock(&sdp->sd_ail_lock); + blk_finish_plug(&plug); + trace_gfs2_ail_flush(sdp, wbc, 0); +} + +/** + * gfs2_ail1_start - start writeback of all ail1 entries + * @sdp: The superblock + */ + +static void gfs2_ail1_start(struct gfs2_sbd *sdp) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = LONG_MAX, + .range_start = 0, + .range_end = LLONG_MAX, + }; + + return gfs2_ail1_flush(sdp, &wbc); } /** @@ -138,97 +187,72 @@ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) * */ -static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags) +static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { struct gfs2_bufdata *bd, *s; struct buffer_head *bh; - list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, + list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) { bh = bd->bd_bh; - - gfs2_assert(sdp, bd->bd_ail == ai); - - if (buffer_busy(bh)) { - if (flags & DIO_ALL) - continue; - else - break; - } - + gfs2_assert(sdp, bd->bd_tr == tr); + if (buffer_busy(bh)) + continue; if (!buffer_uptodate(bh)) gfs2_io_error_bh(sdp, bh); - - list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); + list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); } - return list_empty(&ai->ai_ail1_list); } -static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) -{ - struct list_head *head; - u64 sync_gen; - struct list_head *first; - struct gfs2_ail *first_ai, *ai, *tmp; - int done = 0; - - gfs2_log_lock(sdp); - head = &sdp->sd_ail1_list; - if (list_empty(head)) { - gfs2_log_unlock(sdp); - return; - } - sync_gen = sdp->sd_ail_sync_gen++; - - first = head->prev; - first_ai = list_entry(first, struct gfs2_ail, ai_list); - first_ai->ai_sync_gen = sync_gen; - gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */ - - if (flags & DIO_ALL) - first = NULL; - - while(!done) { - if (first && (head->prev != first || - gfs2_ail1_empty_one(sdp, first_ai, 0))) - break; - - done = 1; - list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) { - if (ai->ai_sync_gen >= sync_gen) - continue; - ai->ai_sync_gen = sync_gen; - gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */ - done = 0; - break; - } - } - - gfs2_log_unlock(sdp); -} +/** + * gfs2_ail1_empty - Try to empty the ail1 lists + * @sdp: The superblock + * + * Tries to empty the ail1 lists, starting with the oldest first + */ -static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) +static int gfs2_ail1_empty(struct gfs2_sbd *sdp) { - struct gfs2_ail *ai, *s; + struct gfs2_trans *tr, *s; + int oldest_tr = 1; int ret; - gfs2_log_lock(sdp); - - list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) { - if (gfs2_ail1_empty_one(sdp, ai, flags)) - list_move(&ai->ai_list, &sdp->sd_ail2_list); - else if (!(flags & DIO_ALL)) - break; + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { + gfs2_ail1_empty_one(sdp, tr); + if (list_empty(&tr->tr_ail1_list) && oldest_tr) + list_move(&tr->tr_list, &sdp->sd_ail2_list); + else + oldest_tr = 0; } - ret = list_empty(&sdp->sd_ail1_list); - - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); return ret; } +static void gfs2_ail1_wait(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr; + struct gfs2_bufdata *bd; + struct buffer_head *bh; + + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry(bd, &tr->tr_ail1_list, bd_ail_st_list) { + bh = bd->bd_bh; + if (!buffer_locked(bh)) + continue; + get_bh(bh); + spin_unlock(&sdp->sd_ail_lock); + wait_on_buffer(bh); + brelse(bh); + return; + } + } + spin_unlock(&sdp->sd_ail_lock); +} /** * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced @@ -237,43 +261,60 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) * */ -static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - struct list_head *head = &ai->ai_ail2_list; + struct list_head *head = &tr->tr_ail2_list; struct gfs2_bufdata *bd; while (!list_empty(head)) { bd = list_entry(head->prev, struct gfs2_bufdata, bd_ail_st_list); - gfs2_assert(sdp, bd->bd_ail == ai); + gfs2_assert(sdp, bd->bd_tr == tr); gfs2_remove_from_ail(bd); } } static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) { - struct gfs2_ail *ai, *safe; + struct gfs2_trans *tr, *safe; unsigned int old_tail = sdp->sd_log_tail; int wrap = (new_tail < old_tail); int a, b, rm; - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) { - a = (old_tail <= ai->ai_first); - b = (ai->ai_first < new_tail); + list_for_each_entry_safe(tr, safe, &sdp->sd_ail2_list, tr_list) { + a = (old_tail <= tr->tr_first); + b = (tr->tr_first < new_tail); rm = (wrap) ? (a || b) : (a && b); if (!rm) continue; - gfs2_ail2_empty_one(sdp, ai); - list_del(&ai->ai_list); - gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list)); - gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list)); - kfree(ai); + gfs2_ail2_empty_one(sdp, tr); + list_del(&tr->tr_list); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); + kfree(tr); } - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); +} + +/** + * gfs2_log_release - Release a given number of log blocks + * @sdp: The GFS2 superblock + * @blks: The number of blocks + * + */ + +void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) +{ + + atomic_add(blks, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, blks); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); + up_read(&sdp->sd_log_flush_lock); } /** @@ -287,66 +328,58 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) * flush time, so we ensure that we have just enough free blocks at all * times to avoid running out during a log flush. * + * We no longer flush the log here, instead we wake up logd to do that + * for us. To avoid the thundering herd and to ensure that we deal fairly + * with queued waiters, we use an exclusive wait. This means that when we + * get woken with enough journal space to get our reservation, we need to + * wake the next waiter on the list. + * * Returns: errno */ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) { - unsigned int try = 0; - unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); + unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize); + unsigned wanted = blks + reserved_blks; + DEFINE_WAIT(wait); + int did_wait = 0; + unsigned int free_blocks; if (gfs2_assert_warn(sdp, blks) || gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) return -EINVAL; - - mutex_lock(&sdp->sd_log_reserve_mutex); - gfs2_log_lock(sdp); - while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) { - gfs2_log_unlock(sdp); - gfs2_ail1_empty(sdp, 0); - gfs2_log_flush(sdp, NULL); - - if (try++) - gfs2_ail1_start(sdp, 0); - gfs2_log_lock(sdp); +retry: + free_blocks = atomic_read(&sdp->sd_log_blks_free); + if (unlikely(free_blocks <= wanted)) { + do { + prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait, + TASK_UNINTERRUPTIBLE); + wake_up(&sdp->sd_logd_waitq); + did_wait = 1; + if (atomic_read(&sdp->sd_log_blks_free) <= wanted) + io_schedule(); + free_blocks = atomic_read(&sdp->sd_log_blks_free); + } while(free_blocks <= wanted); + finish_wait(&sdp->sd_log_waitq, &wait); } - atomic_sub(blks, &sdp->sd_log_blks_free); - gfs2_log_unlock(sdp); - mutex_unlock(&sdp->sd_log_reserve_mutex); + if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks, + free_blocks - blks) != free_blocks) + goto retry; + trace_gfs2_log_blocks(sdp, -blks); + + /* + * If we waited, then so might others, wake them up _after_ we get + * our share of the log. + */ + if (unlikely(did_wait)) + wake_up(&sdp->sd_log_waitq); down_read(&sdp->sd_log_flush_lock); - - return 0; -} - -/** - * gfs2_log_release - Release a given number of log blocks - * @sdp: The GFS2 superblock - * @blks: The number of blocks - * - */ - -void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) -{ - - gfs2_log_lock(sdp); - atomic_add(blks, &sdp->sd_log_blks_free); - gfs2_assert_withdraw(sdp, - atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); - gfs2_log_unlock(sdp); - up_read(&sdp->sd_log_flush_lock); -} - -static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) -{ - struct gfs2_journal_extent *je; - - list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) { - if (lbn >= je->lblock && lbn < je->lblock + je->blocks) - return je->dblock + lbn - je->lblock; + if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) { + gfs2_log_release(sdp, blks); + return -EROFS; } - - return -1; + return 0; } /** @@ -401,24 +434,22 @@ static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer static unsigned int calc_reserved(struct gfs2_sbd *sdp) { unsigned int reserved = 0; - unsigned int mbuf_limit, metabufhdrs_needed; - unsigned int dbuf_limit, databufhdrs_needed; - unsigned int revokes = 0; - - mbuf_limit = buf_limit(sdp); - metabufhdrs_needed = (sdp->sd_log_commited_buf + - (mbuf_limit - 1)) / mbuf_limit; - dbuf_limit = databuf_limit(sdp); - databufhdrs_needed = (sdp->sd_log_commited_databuf + - (dbuf_limit - 1)) / dbuf_limit; - - if (sdp->sd_log_commited_revoke) - revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, - sizeof(u64)); + unsigned int mbuf; + unsigned int dbuf; + struct gfs2_trans *tr = sdp->sd_log_tr; + + if (tr) { + mbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm; + dbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm; + reserved = mbuf + dbuf; + /* Account for header blocks */ + reserved += DIV_ROUND_UP(mbuf, buf_limit(sdp)); + reserved += DIV_ROUND_UP(dbuf, databuf_limit(sdp)); + } - reserved = sdp->sd_log_commited_buf + metabufhdrs_needed + - sdp->sd_log_commited_databuf + databufhdrs_needed + - revokes; + if (sdp->sd_log_commited_revoke > 0) + reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, + sizeof(u64)); /* One for the overall header */ if (reserved) reserved++; @@ -427,139 +458,184 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp) static unsigned int current_tail(struct gfs2_sbd *sdp) { - struct gfs2_ail *ai; + struct gfs2_trans *tr; unsigned int tail; - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); if (list_empty(&sdp->sd_ail1_list)) { tail = sdp->sd_log_head; } else { - ai = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail, ai_list); - tail = ai->ai_first; + tr = list_entry(sdp->sd_ail1_list.prev, struct gfs2_trans, + tr_list); + tail = tr->tr_first; } - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); return tail; } -void gfs2_log_incr_head(struct gfs2_sbd *sdp) +static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) { - if (sdp->sd_log_flush_head == sdp->sd_log_tail) - BUG_ON(sdp->sd_log_flush_head != sdp->sd_log_head); + unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail); - if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) { - sdp->sd_log_flush_head = 0; - sdp->sd_log_flush_wrapped = 1; - } + ail2_empty(sdp, new_tail); + + atomic_add(dist, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, dist); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); + + sdp->sd_log_tail = new_tail; } -/** - * gfs2_log_write_endio - End of I/O for a log buffer - * @bh: The buffer head - * @uptodate: I/O Status - * - */ -static void gfs2_log_write_endio(struct buffer_head *bh, int uptodate) +static void log_flush_wait(struct gfs2_sbd *sdp) { - struct gfs2_sbd *sdp = bh->b_private; - bh->b_private = NULL; + DEFINE_WAIT(wait); - end_buffer_write_sync(bh, uptodate); - if (atomic_dec_and_test(&sdp->sd_log_in_flight)) - wake_up(&sdp->sd_log_flush_wait); + if (atomic_read(&sdp->sd_log_in_flight)) { + do { + prepare_to_wait(&sdp->sd_log_flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (atomic_read(&sdp->sd_log_in_flight)) + io_schedule(); + } while(atomic_read(&sdp->sd_log_in_flight)); + finish_wait(&sdp->sd_log_flush_wait, &wait); + } } -/** - * gfs2_log_get_buf - Get and initialize a buffer to use for log control data - * @sdp: The GFS2 superblock - * - * Returns: the buffer_head - */ - -struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp) +static int ip_cmp(void *priv, struct list_head *a, struct list_head *b) { - u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); - struct buffer_head *bh; + struct gfs2_inode *ipa, *ipb; - bh = sb_getblk(sdp->sd_vfs, blkno); - lock_buffer(bh); - memset(bh->b_data, 0, bh->b_size); - set_buffer_uptodate(bh); - clear_buffer_dirty(bh); - gfs2_log_incr_head(sdp); - atomic_inc(&sdp->sd_log_in_flight); - bh->b_private = sdp; - bh->b_end_io = gfs2_log_write_endio; + ipa = list_entry(a, struct gfs2_inode, i_ordered); + ipb = list_entry(b, struct gfs2_inode, i_ordered); - return bh; + if (ipa->i_no_addr < ipb->i_no_addr) + return -1; + if (ipa->i_no_addr > ipb->i_no_addr) + return 1; + return 0; } -/** - * gfs2_fake_write_endio - - * @bh: The buffer head - * @uptodate: The I/O Status - * - */ - -static void gfs2_fake_write_endio(struct buffer_head *bh, int uptodate) +static void gfs2_ordered_write(struct gfs2_sbd *sdp) { - struct buffer_head *real_bh = bh->b_private; - struct gfs2_bufdata *bd = real_bh->b_private; - struct gfs2_sbd *sdp = bd->bd_gl->gl_sbd; + struct gfs2_inode *ip; + LIST_HEAD(written); - end_buffer_write_sync(bh, uptodate); - free_buffer_head(bh); - unlock_buffer(real_bh); - brelse(real_bh); - if (atomic_dec_and_test(&sdp->sd_log_in_flight)) - wake_up(&sdp->sd_log_flush_wait); + spin_lock(&sdp->sd_ordered_lock); + list_sort(NULL, &sdp->sd_log_le_ordered, &ip_cmp); + while (!list_empty(&sdp->sd_log_le_ordered)) { + ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); + list_move(&ip->i_ordered, &written); + if (ip->i_inode.i_mapping->nrpages == 0) + continue; + spin_unlock(&sdp->sd_ordered_lock); + filemap_fdatawrite(ip->i_inode.i_mapping); + spin_lock(&sdp->sd_ordered_lock); + } + list_splice(&written, &sdp->sd_log_le_ordered); + spin_unlock(&sdp->sd_ordered_lock); } -/** - * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log - * @sdp: the filesystem - * @data: the data the buffer_head should point to - * - * Returns: the log buffer descriptor - */ - -struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, - struct buffer_head *real) +static void gfs2_ordered_wait(struct gfs2_sbd *sdp) { - u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); - struct buffer_head *bh; + struct gfs2_inode *ip; - bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL); - atomic_set(&bh->b_count, 1); - bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock); - set_bh_page(bh, real->b_page, bh_offset(real)); - bh->b_blocknr = blkno; - bh->b_size = sdp->sd_sb.sb_bsize; - bh->b_bdev = sdp->sd_vfs->s_bdev; - bh->b_private = real; - bh->b_end_io = gfs2_fake_write_endio; + spin_lock(&sdp->sd_ordered_lock); + while (!list_empty(&sdp->sd_log_le_ordered)) { + ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); + list_del(&ip->i_ordered); + WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags)); + if (ip->i_inode.i_mapping->nrpages == 0) + continue; + spin_unlock(&sdp->sd_ordered_lock); + filemap_fdatawait(ip->i_inode.i_mapping); + spin_lock(&sdp->sd_ordered_lock); + } + spin_unlock(&sdp->sd_ordered_lock); +} - gfs2_log_incr_head(sdp); - atomic_inc(&sdp->sd_log_in_flight); +void gfs2_ordered_del_inode(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - return bh; + spin_lock(&sdp->sd_ordered_lock); + if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags)) + list_del(&ip->i_ordered); + spin_unlock(&sdp->sd_ordered_lock); } -static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) +void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { - unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail); + struct buffer_head *bh = bd->bd_bh; + struct gfs2_glock *gl = bd->bd_gl; - ail2_empty(sdp, new_tail); + bh->b_private = NULL; + bd->bd_blkno = bh->b_blocknr; + gfs2_remove_from_ail(bd); /* drops ref on bh */ + bd->bd_bh = NULL; + bd->bd_ops = &gfs2_revoke_lops; + sdp->sd_log_num_revoke++; + atomic_inc(&gl->gl_revokes); + set_bit(GLF_LFLUSH, &gl->gl_flags); + list_add(&bd->bd_list, &sdp->sd_log_le_revoke); +} +void gfs2_write_revokes(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr; + struct gfs2_bufdata *bd, *tmp; + int have_revokes = 0; + int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); + + gfs2_ail1_empty(sdp); + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) { + if (list_empty(&bd->bd_list)) { + have_revokes = 1; + goto done; + } + } + } +done: + spin_unlock(&sdp->sd_ail_lock); + if (have_revokes == 0) + return; + while (sdp->sd_log_num_revoke > max_revokes) + max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); + max_revokes -= sdp->sd_log_num_revoke; + if (!sdp->sd_log_num_revoke) { + atomic_dec(&sdp->sd_log_blks_free); + /* If no blocks have been reserved, we need to also + * reserve a block for the header */ + if (!sdp->sd_log_blks_reserved) + atomic_dec(&sdp->sd_log_blks_free); + } gfs2_log_lock(sdp); - atomic_add(dist, &sdp->sd_log_blks_free); - gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) { + if (max_revokes == 0) + goto out_of_blocks; + if (!list_empty(&bd->bd_list)) + continue; + gfs2_add_revoke(sdp, bd); + max_revokes--; + } + } +out_of_blocks: + spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); - sdp->sd_log_tail = new_tail; + if (!sdp->sd_log_num_revoke) { + atomic_inc(&sdp->sd_log_blks_free); + if (!sdp->sd_log_blks_reserved) + atomic_inc(&sdp->sd_log_blks_free); + } } /** @@ -569,116 +645,43 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) * Returns: the initialized log buffer descriptor */ -static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) +static void log_write_header(struct gfs2_sbd *sdp, u32 flags) { - u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); - struct buffer_head *bh; struct gfs2_log_header *lh; unsigned int tail; u32 hash; + int rw = WRITE_FLUSH_FUA | REQ_META; + struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + lh = page_address(page); + clear_page(lh); - bh = sb_getblk(sdp->sd_vfs, blkno); - lock_buffer(bh); - memset(bh->b_data, 0, bh->b_size); - set_buffer_uptodate(bh); - clear_buffer_dirty(bh); - unlock_buffer(bh); - - gfs2_ail1_empty(sdp, 0); tail = current_tail(sdp); - lh = (struct gfs2_log_header *)bh->b_data; - memset(lh, 0, sizeof(struct gfs2_log_header)); lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); + lh->lh_header.__pad0 = cpu_to_be64(0); lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); + lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++); lh->lh_flags = cpu_to_be32(flags); lh->lh_tail = cpu_to_be32(tail); lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head); - hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header)); + hash = gfs2_disk_hash(page_address(page), sizeof(struct gfs2_log_header)); lh->lh_hash = cpu_to_be32(hash); - set_buffer_dirty(bh); - if (sync_dirty_buffer(bh)) - gfs2_io_error_bh(sdp, bh); - brelse(bh); - - if (sdp->sd_log_tail != tail) - log_pull_tail(sdp, tail); - else - gfs2_assert_withdraw(sdp, !pull); - - sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); - gfs2_log_incr_head(sdp); -} - -static void log_flush_commit(struct gfs2_sbd *sdp) -{ - DEFINE_WAIT(wait); - - if (atomic_read(&sdp->sd_log_in_flight)) { - do { - prepare_to_wait(&sdp->sd_log_flush_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (atomic_read(&sdp->sd_log_in_flight)) - io_schedule(); - } while(atomic_read(&sdp->sd_log_in_flight)); - finish_wait(&sdp->sd_log_flush_wait, &wait); + if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { + gfs2_ordered_wait(sdp); + log_flush_wait(sdp); + rw = WRITE_SYNC | REQ_META | REQ_PRIO; } - log_write_header(sdp, 0, 0); -} - -static void gfs2_ordered_write(struct gfs2_sbd *sdp) -{ - struct gfs2_bufdata *bd; - struct buffer_head *bh; - LIST_HEAD(written); - - gfs2_log_lock(sdp); - while (!list_empty(&sdp->sd_log_le_ordered)) { - bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_le.le_list); - list_move(&bd->bd_le.le_list, &written); - bh = bd->bd_bh; - if (!buffer_dirty(bh)) - continue; - get_bh(bh); - gfs2_log_unlock(sdp); - lock_buffer(bh); - if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { - bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, bh); - } else { - unlock_buffer(bh); - brelse(bh); - } - gfs2_log_lock(sdp); - } - list_splice(&written, &sdp->sd_log_le_ordered); - gfs2_log_unlock(sdp); -} - -static void gfs2_ordered_wait(struct gfs2_sbd *sdp) -{ - struct gfs2_bufdata *bd; - struct buffer_head *bh; + sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); + gfs2_log_write_page(sdp, page); + gfs2_log_flush_bio(sdp, rw); + log_flush_wait(sdp); - gfs2_log_lock(sdp); - while (!list_empty(&sdp->sd_log_le_ordered)) { - bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_le.le_list); - bh = bd->bd_bh; - if (buffer_locked(bh)) { - get_bh(bh); - gfs2_log_unlock(sdp); - wait_on_buffer(bh); - brelse(bh); - gfs2_log_lock(sdp); - continue; - } - list_del_init(&bd->bd_le.le_list); - } - gfs2_log_unlock(sdp); + if (sdp->sd_log_tail != tail) + log_pull_tail(sdp, tail); } /** @@ -688,9 +691,10 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) * */ -void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, + enum gfs2_flush_type type) { - struct gfs2_ail *ai; + struct gfs2_trans *tr; down_write(&sdp->sd_log_flush_lock); @@ -699,79 +703,133 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) up_write(&sdp->sd_log_flush_lock); return; } + trace_gfs2_log_flush(sdp, 1); - ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL); - INIT_LIST_HEAD(&ai->ai_ail1_list); - INIT_LIST_HEAD(&ai->ai_ail2_list); - - if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) { - printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf, - sdp->sd_log_commited_buf); - gfs2_assert_withdraw(sdp, 0); - } - if (sdp->sd_log_num_databuf != sdp->sd_log_commited_databuf) { - printk(KERN_INFO "GFS2: log databuf %u %u\n", - sdp->sd_log_num_databuf, sdp->sd_log_commited_databuf); - gfs2_assert_withdraw(sdp, 0); + sdp->sd_log_flush_head = sdp->sd_log_head; + sdp->sd_log_flush_wrapped = 0; + tr = sdp->sd_log_tr; + if (tr) { + sdp->sd_log_tr = NULL; + INIT_LIST_HEAD(&tr->tr_ail1_list); + INIT_LIST_HEAD(&tr->tr_ail2_list); + tr->tr_first = sdp->sd_log_flush_head; } + gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); - sdp->sd_log_flush_head = sdp->sd_log_head; - sdp->sd_log_flush_wrapped = 0; - ai->ai_first = sdp->sd_log_flush_head; - gfs2_ordered_write(sdp); - lops_before_commit(sdp); - gfs2_ordered_wait(sdp); + lops_before_commit(sdp, tr); + gfs2_log_flush_bio(sdp, WRITE); - if (sdp->sd_log_head != sdp->sd_log_flush_head) - log_flush_commit(sdp); - else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ - gfs2_log_lock(sdp); + if (sdp->sd_log_head != sdp->sd_log_flush_head) { + log_flush_wait(sdp); + log_write_header(sdp, 0); + } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ - gfs2_log_unlock(sdp); - log_write_header(sdp, 0, PULL); + trace_gfs2_log_blocks(sdp, -1); + log_write_header(sdp, 0); } - lops_after_commit(sdp, ai); + lops_after_commit(sdp, tr); gfs2_log_lock(sdp); sdp->sd_log_head = sdp->sd_log_flush_head; sdp->sd_log_blks_reserved = 0; - sdp->sd_log_commited_buf = 0; - sdp->sd_log_commited_databuf = 0; sdp->sd_log_commited_revoke = 0; - if (!list_empty(&ai->ai_ail1_list)) { - list_add(&ai->ai_list, &sdp->sd_ail1_list); - ai = NULL; + spin_lock(&sdp->sd_ail_lock); + if (tr && !list_empty(&tr->tr_ail1_list)) { + list_add(&tr->tr_list, &sdp->sd_ail1_list); + tr = NULL; } + spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); - sdp->sd_vfs->s_dirt = 0; + if (atomic_read(&sdp->sd_log_freeze)) + type = FREEZE_FLUSH; + if (type != NORMAL_FLUSH) { + if (!sdp->sd_log_idle) { + for (;;) { + gfs2_ail1_start(sdp); + gfs2_ail1_wait(sdp); + if (gfs2_ail1_empty(sdp)) + break; + } + atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ + trace_gfs2_log_blocks(sdp, -1); + sdp->sd_log_flush_wrapped = 0; + log_write_header(sdp, 0); + sdp->sd_log_head = sdp->sd_log_flush_head; + } + if (type == SHUTDOWN_FLUSH || type == FREEZE_FLUSH) + gfs2_log_shutdown(sdp); + if (type == FREEZE_FLUSH) { + int error; + + atomic_set(&sdp->sd_log_freeze, 0); + wake_up(&sdp->sd_log_frozen_wait); + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, + LM_ST_SHARED, 0, + &sdp->sd_thaw_gh); + if (error) { + printk(KERN_INFO "GFS2: couln't get freeze lock : %d\n", error); + gfs2_assert_withdraw(sdp, 0); + } + else + gfs2_glock_dq_uninit(&sdp->sd_thaw_gh); + } + } + + trace_gfs2_log_flush(sdp, 0); up_write(&sdp->sd_log_flush_lock); - kfree(ai); + kfree(tr); +} + +/** + * gfs2_merge_trans - Merge a new transaction into a cached transaction + * @old: Original transaction to be expanded + * @new: New transaction to be merged + */ + +static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new) +{ + WARN_ON_ONCE(old->tr_attached != 1); + + old->tr_num_buf_new += new->tr_num_buf_new; + old->tr_num_databuf_new += new->tr_num_databuf_new; + old->tr_num_buf_rm += new->tr_num_buf_rm; + old->tr_num_databuf_rm += new->tr_num_databuf_rm; + old->tr_num_revoke += new->tr_num_revoke; + old->tr_num_revoke_rm += new->tr_num_revoke_rm; + + list_splice_tail_init(&new->tr_databuf, &old->tr_databuf); + list_splice_tail_init(&new->tr_buf, &old->tr_buf); } static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { unsigned int reserved; unsigned int unused; + unsigned int maxres; gfs2_log_lock(sdp); - sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm; - sdp->sd_log_commited_databuf += tr->tr_num_databuf_new - - tr->tr_num_databuf_rm; - gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) || - (((int)sdp->sd_log_commited_databuf) >= 0)); + if (sdp->sd_log_tr) { + gfs2_merge_trans(sdp->sd_log_tr, tr); + } else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) { + gfs2_assert_withdraw(sdp, tr->tr_alloced); + sdp->sd_log_tr = tr; + tr->tr_attached = 1; + } + sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; - gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0); reserved = calc_reserved(sdp); - unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; - gfs2_assert_withdraw(sdp, unused >= 0); + maxres = sdp->sd_log_blks_reserved + tr->tr_reserved; + gfs2_assert_withdraw(sdp, maxres >= reserved); + unused = maxres - reserved; atomic_add(unused, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, unused); gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); sdp->sd_log_blks_reserved = reserved; @@ -784,21 +842,24 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) * @sdp: the filesystem * @tr: the transaction * + * We wake up gfs2_logd if the number of pinned blocks exceed thresh1 + * or the total number of used blocks (pinned blocks plus AIL blocks) + * is greater than thresh2. + * + * At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of + * journal size. + * * Returns: errno */ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { log_refund(sdp, tr); - lops_incore_commit(sdp, tr); - sdp->sd_vfs->s_dirt = 1; - up_read(&sdp->sd_log_flush_lock); - - gfs2_log_lock(sdp); - if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) - wake_up_process(sdp->sd_logd_process); - gfs2_log_unlock(sdp); + if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) || + ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) > + atomic_read(&sdp->sd_log_thresh2))) + wake_up(&sdp->sd_logd_waitq); } /** @@ -809,49 +870,32 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) void gfs2_log_shutdown(struct gfs2_sbd *sdp) { - down_write(&sdp->sd_log_flush_lock); - gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); - gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf); gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); - gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg); - gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf); gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); sdp->sd_log_flush_head = sdp->sd_log_head; sdp->sd_log_flush_wrapped = 0; - log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, - (sdp->sd_log_tail == current_tail(sdp)) ? 0 : PULL); + log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT); - gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list)); sdp->sd_log_head = sdp->sd_log_flush_head; sdp->sd_log_tail = sdp->sd_log_head; - - up_write(&sdp->sd_log_flush_lock); } - -/** - * gfs2_meta_syncfs - sync all the buffers in a filesystem - * @sdp: the filesystem - * - */ - -void gfs2_meta_syncfs(struct gfs2_sbd *sdp) +static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) { - gfs2_log_flush(sdp, NULL); - for (;;) { - gfs2_ail1_start(sdp, DIO_ALL); - if (gfs2_ail1_empty(sdp, DIO_ALL)) - break; - msleep(10); - } + return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1) || atomic_read(&sdp->sd_log_freeze)); } +static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) +{ + unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free); + return used_blocks >= atomic_read(&sdp->sd_log_thresh2); +} /** * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks @@ -864,28 +908,41 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp) int gfs2_logd(void *data) { struct gfs2_sbd *sdp = data; - unsigned long t; - int need_flush; + unsigned long t = 1; + DEFINE_WAIT(wait); while (!kthread_should_stop()) { - /* Advance the log tail */ - - t = sdp->sd_log_flush_time + - gfs2_tune_get(sdp, gt_log_flush_secs) * HZ; - - gfs2_ail1_empty(sdp, DIO_ALL); - gfs2_log_lock(sdp); - need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks); - gfs2_log_unlock(sdp); - if (need_flush || time_after_eq(jiffies, t)) { - gfs2_log_flush(sdp, NULL); - sdp->sd_log_flush_time = jiffies; + + if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { + gfs2_ail1_empty(sdp); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + } + + if (gfs2_ail_flush_reqd(sdp)) { + gfs2_ail1_start(sdp); + gfs2_ail1_wait(sdp); + gfs2_ail1_empty(sdp); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); } + if (!gfs2_ail_flush_reqd(sdp)) + wake_up(&sdp->sd_log_waitq); + t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; - if (freezing(current)) - refrigerator(); - schedule_timeout_interruptible(t); + + try_to_freeze(); + + do { + prepare_to_wait(&sdp->sd_logd_waitq, &wait, + TASK_INTERRUPTIBLE); + if (!gfs2_ail_flush_reqd(sdp) && + !gfs2_jrnl_flush_reqd(sdp) && + !kthread_should_stop()) + t = schedule_timeout(t); + } while(t && !gfs2_ail_flush_reqd(sdp) && + !gfs2_jrnl_flush_reqd(sdp) && + !kthread_should_stop()); + finish_wait(&sdp->sd_logd_waitq, &wait); } return 0; diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 77115281650..9499a604921 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -12,6 +12,7 @@ #include <linux/list.h> #include <linux/spinlock.h> +#include <linux/writeback.h> #include "incore.h" /** @@ -21,6 +22,7 @@ */ static inline void gfs2_log_lock(struct gfs2_sbd *sdp) +__acquires(&sdp->sd_log_lock) { spin_lock(&sdp->sd_log_lock); } @@ -32,6 +34,7 @@ static inline void gfs2_log_lock(struct gfs2_sbd *sdp) */ static inline void gfs2_log_unlock(struct gfs2_sbd *sdp) +__releases(&sdp->sd_log_lock) { spin_unlock(&sdp->sd_log_lock); } @@ -45,29 +48,38 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, sdp->sd_log_head = sdp->sd_log_tail = value; } -unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, - unsigned int ssize); - -int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); -void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); -void gfs2_log_incr_head(struct gfs2_sbd *sdp); - -struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); -struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, - struct buffer_head *real); -void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); - -static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl) +static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) { - if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags)) - __gfs2_log_flush(sbd, gl); + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + if (!test_bit(GIF_ORDERED, &ip->i_flags)) { + spin_lock(&sdp->sd_ordered_lock); + if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags)) + list_add(&ip->i_ordered, &sdp->sd_log_le_ordered); + spin_unlock(&sdp->sd_ordered_lock); + } } +extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); +extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, + unsigned int ssize); -void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); -void gfs2_remove_from_ail(struct gfs2_bufdata *bd); +extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); +extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); +enum gfs2_flush_type { + NORMAL_FLUSH = 0, + SYNC_FLUSH, + SHUTDOWN_FLUSH, + FREEZE_FLUSH +}; +extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, + enum gfs2_flush_type type); +extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); +extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); +extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); -void gfs2_log_shutdown(struct gfs2_sbd *sdp); -void gfs2_meta_syncfs(struct gfs2_sbd *sdp); -int gfs2_logd(void *data); +extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); +extern int gfs2_logd(void *data); +extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +extern void gfs2_write_revokes(struct gfs2_sbd *sdp); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index fae59d69d01..2c1ae861dc9 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -12,8 +12,11 @@ #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> +#include <linux/mempool.h> #include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> +#include <linux/bio.h> +#include <linux/fs.h> +#include <linux/list_sort.h> #include "gfs2.h" #include "incore.h" @@ -26,6 +29,7 @@ #include "rgrp.h" #include "trans.h" #include "util.h" +#include "trace_gfs2.h" /** * gfs2_pin - Pin a buffer in memory @@ -34,11 +38,11 @@ * * The log lock must be held when calling this function */ -static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) +void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) { struct gfs2_bufdata *bd; - gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)); + BUG_ON(!current->journal_info); clear_buffer_dirty(bh); if (test_set_buffer_pinned(bh)) @@ -49,9 +53,37 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) /* If this buffer is in the AIL and it has already been written * to in-place disk block, remove it from the AIL. */ - if (bd->bd_ail) - list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); + spin_lock(&sdp->sd_ail_lock); + if (bd->bd_tr) + list_move(&bd->bd_ail_st_list, &bd->bd_tr->tr_ail2_list); + spin_unlock(&sdp->sd_ail_lock); get_bh(bh); + atomic_inc(&sdp->sd_log_pinned); + trace_gfs2_pin(bd, 1); +} + +static bool buffer_is_rgrp(const struct gfs2_bufdata *bd) +{ + return bd->bd_gl->gl_name.ln_type == LM_TYPE_RGRP; +} + +static void maybe_release_space(struct gfs2_bufdata *bd) +{ + struct gfs2_glock *gl = bd->bd_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_rgrpd *rgd = gl->gl_object; + unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number; + struct gfs2_bitmap *bi = rgd->rd_bits + index; + + if (bi->bi_clone == NULL) + return; + if (sdp->sd_args.ar_discard) + gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL); + memcpy(bi->bi_clone + bi->bi_offset, + bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); + clear_bit(GBF_FULL, &bi->bi_flags); + rgd->rd_free_clone = rgd->rd_free; + rgd->rd_extfail_pt = rgd->rd_free; } /** @@ -59,25 +91,27 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) * @sdp: the filesystem the buffer belongs to * @bh: The buffer to unpin * @ai: + * @flags: The inode dirty flags * */ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, - struct gfs2_ail *ai) + struct gfs2_trans *tr) { struct gfs2_bufdata *bd = bh->b_private; - gfs2_assert_withdraw(sdp, buffer_uptodate(bh)); - - if (!buffer_pinned(bh)) - gfs2_assert_withdraw(sdp, 0); + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(!buffer_pinned(bh)); lock_buffer(bh); mark_buffer_dirty(bh); clear_buffer_pinned(bh); - gfs2_log_lock(sdp); - if (bd->bd_ail) { + if (buffer_is_rgrp(bd)) + maybe_release_space(bd); + + spin_lock(&sdp->sd_ail_lock); + if (bd->bd_tr) { list_del(&bd->bd_ail_st_list); brelse(bh); } else { @@ -85,137 +119,367 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list); atomic_inc(&gl->gl_ail_count); } - bd->bd_ail = ai; - list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); + bd->bd_tr = tr; + list_add(&bd->bd_ail_st_list, &tr->tr_ail1_list); + spin_unlock(&sdp->sd_ail_lock); + clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); - gfs2_log_unlock(sdp); + trace_gfs2_pin(bd, 0); unlock_buffer(bh); + atomic_dec(&sdp->sd_log_pinned); +} + +static void gfs2_log_incr_head(struct gfs2_sbd *sdp) +{ + BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) && + (sdp->sd_log_flush_head != sdp->sd_log_head)); + + if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) { + sdp->sd_log_flush_head = 0; + sdp->sd_log_flush_wrapped = 1; + } } +static u64 gfs2_log_bmap(struct gfs2_sbd *sdp) +{ + unsigned int lbn = sdp->sd_log_flush_head; + struct gfs2_journal_extent *je; + u64 block; + + list_for_each_entry(je, &sdp->sd_jdesc->extent_list, list) { + if ((lbn >= je->lblock) && (lbn < (je->lblock + je->blocks))) { + block = je->dblock + lbn - je->lblock; + gfs2_log_incr_head(sdp); + return block; + } + } + + return -1; +} + +/** + * gfs2_end_log_write_bh - end log write of pagecache data with buffers + * @sdp: The superblock + * @bvec: The bio_vec + * @error: The i/o status + * + * This finds the relavent buffers and unlocks then and sets the + * error flag according to the status of the i/o request. This is + * used when the log is writing data which has an in-place version + * that is pinned in the pagecache. + */ -static inline struct gfs2_log_descriptor *bh_log_desc(struct buffer_head *bh) +static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, + int error) { - return (struct gfs2_log_descriptor *)bh->b_data; + struct buffer_head *bh, *next; + struct page *page = bvec->bv_page; + unsigned size; + + bh = page_buffers(page); + size = bvec->bv_len; + while (bh_offset(bh) < bvec->bv_offset) + bh = bh->b_this_page; + do { + if (error) + set_buffer_write_io_error(bh); + unlock_buffer(bh); + next = bh->b_this_page; + size -= bh->b_size; + brelse(bh); + bh = next; + } while(bh && size); } -static inline __be64 *bh_log_ptr(struct buffer_head *bh) +/** + * gfs2_end_log_write - end of i/o to the log + * @bio: The bio + * @error: Status of i/o request + * + * Each bio_vec contains either data from the pagecache or data + * relating to the log itself. Here we iterate over the bio_vec + * array, processing both kinds of data. + * + */ + +static void gfs2_end_log_write(struct bio *bio, int error) { - struct gfs2_log_descriptor *ld = bh_log_desc(bh); - return (__force __be64 *)(ld + 1); + struct gfs2_sbd *sdp = bio->bi_private; + struct bio_vec *bvec; + struct page *page; + int i; + + if (error) { + sdp->sd_log_error = error; + fs_err(sdp, "Error %d writing to log\n", error); + } + + bio_for_each_segment_all(bvec, bio, i) { + page = bvec->bv_page; + if (page_has_buffers(page)) + gfs2_end_log_write_bh(sdp, bvec, error); + else + mempool_free(page, gfs2_page_pool); + } + + bio_put(bio); + if (atomic_dec_and_test(&sdp->sd_log_in_flight)) + wake_up(&sdp->sd_log_flush_wait); } -static inline __be64 *bh_ptr_end(struct buffer_head *bh) +/** + * gfs2_log_flush_bio - Submit any pending log bio + * @sdp: The superblock + * @rw: The rw flags + * + * Submit any pending part-built or full bio to the block device. If + * there is no pending bio, then this is a no-op. + */ + +void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw) { - return (__force __be64 *)(bh->b_data + bh->b_size); + if (sdp->sd_log_bio) { + atomic_inc(&sdp->sd_log_in_flight); + submit_bio(rw, sdp->sd_log_bio); + sdp->sd_log_bio = NULL; + } } +/** + * gfs2_log_alloc_bio - Allocate a new bio for log writing + * @sdp: The superblock + * @blkno: The next device block number we want to write to + * + * This should never be called when there is a cached bio in the + * super block. When it returns, there will be a cached bio in the + * super block which will have as many bio_vecs as the device is + * happy to handle. + * + * Returns: Newly allocated bio + */ -static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type) +static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) { - struct buffer_head *bh = gfs2_log_get_buf(sdp); - struct gfs2_log_descriptor *ld = bh_log_desc(bh); + struct super_block *sb = sdp->sd_vfs; + unsigned nrvecs = bio_get_nr_vecs(sb->s_bdev); + struct bio *bio; + + BUG_ON(sdp->sd_log_bio); + + while (1) { + bio = bio_alloc(GFP_NOIO, nrvecs); + if (likely(bio)) + break; + nrvecs = max(nrvecs/2, 1U); + } + + bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); + bio->bi_bdev = sb->s_bdev; + bio->bi_end_io = gfs2_end_log_write; + bio->bi_private = sdp; + + sdp->sd_log_bio = bio; + + return bio; +} + +/** + * gfs2_log_get_bio - Get cached log bio, or allocate a new one + * @sdp: The superblock + * @blkno: The device block number we want to write to + * + * If there is a cached bio, then if the next block number is sequential + * with the previous one, return it, otherwise flush the bio to the + * device. If there is not a cached bio, or we just flushed it, then + * allocate a new one. + * + * Returns: The bio to use for log writes + */ + +static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) +{ + struct bio *bio = sdp->sd_log_bio; + u64 nblk; + + if (bio) { + nblk = bio_end_sector(bio); + nblk >>= sdp->sd_fsb2bb_shift; + if (blkno == nblk) + return bio; + gfs2_log_flush_bio(sdp, WRITE); + } + + return gfs2_log_alloc_bio(sdp, blkno); +} + + +/** + * gfs2_log_write - write to log + * @sdp: the filesystem + * @page: the page to write + * @size: the size of the data to write + * @offset: the offset within the page + * + * Try and add the page segment to the current bio. If that fails, + * submit the current bio to the device and create a new one, and + * then add the page segment to that. + */ + +static void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, + unsigned size, unsigned offset) +{ + u64 blkno = gfs2_log_bmap(sdp); + struct bio *bio; + int ret; + + bio = gfs2_log_get_bio(sdp, blkno); + ret = bio_add_page(bio, page, size, offset); + if (ret == 0) { + gfs2_log_flush_bio(sdp, WRITE); + bio = gfs2_log_alloc_bio(sdp, blkno); + ret = bio_add_page(bio, page, size, offset); + WARN_ON(ret == 0); + } +} + +/** + * gfs2_log_write_bh - write a buffer's content to the log + * @sdp: The super block + * @bh: The buffer pointing to the in-place location + * + * This writes the content of the buffer to the next available location + * in the log. The buffer will be unlocked once the i/o to the log has + * completed. + */ + +static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh)); +} + +/** + * gfs2_log_write_page - write one block stored in a page, into the log + * @sdp: The superblock + * @page: The struct page + * + * This writes the first block-sized part of the page into the log. Note + * that the page must have been allocated from the gfs2_page_pool mempool + * and that after this has been called, ownership has been transferred and + * the page may be freed at any time. + */ + +void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) +{ + struct super_block *sb = sdp->sd_vfs; + gfs2_log_write(sdp, page, sb->s_blocksize, 0); +} + +static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type, + u32 ld_length, u32 ld_data1) +{ + struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + struct gfs2_log_descriptor *ld = page_address(page); + clear_page(ld); ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC); ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD); ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD); ld->ld_type = cpu_to_be32(ld_type); - ld->ld_length = 0; - ld->ld_data1 = 0; + ld->ld_length = cpu_to_be32(ld_length); + ld->ld_data1 = cpu_to_be32(ld_data1); ld->ld_data2 = 0; - memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved)); - return bh; + return page; } -static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +static void gfs2_check_magic(struct buffer_head *bh) { - struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); - struct gfs2_trans *tr; + void *kaddr; + __be32 *ptr; - lock_buffer(bd->bd_bh); - gfs2_log_lock(sdp); - if (!list_empty(&bd->bd_list_tr)) - goto out; - tr = current->journal_info; - tr->tr_touched = 1; - tr->tr_num_buf++; - list_add(&bd->bd_list_tr, &tr->tr_list_buf); - if (!list_empty(&le->le_list)) - goto out; - set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); - set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); - gfs2_meta_check(sdp, bd->bd_bh); - gfs2_pin(sdp, bd->bd_bh); - sdp->sd_log_num_buf++; - list_add(&le->le_list, &sdp->sd_log_le_buf); - tr->tr_num_buf_new++; -out: - gfs2_log_unlock(sdp); - unlock_buffer(bd->bd_bh); + clear_buffer_escaped(bh); + kaddr = kmap_atomic(bh->b_page); + ptr = kaddr + bh_offset(bh); + if (*ptr == cpu_to_be32(GFS2_MAGIC)) + set_buffer_escaped(bh); + kunmap_atomic(kaddr); } -static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b) { - struct list_head *head = &tr->tr_list_buf; - struct gfs2_bufdata *bd; + struct gfs2_bufdata *bda, *bdb; - gfs2_log_lock(sdp); - while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr); - list_del_init(&bd->bd_list_tr); - tr->tr_num_buf--; - } - gfs2_log_unlock(sdp); - gfs2_assert_warn(sdp, !tr->tr_num_buf); + bda = list_entry(a, struct gfs2_bufdata, bd_list); + bdb = list_entry(b, struct gfs2_bufdata, bd_list); + + if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr) + return -1; + if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr) + return 1; + return 0; } -static void buf_lo_before_commit(struct gfs2_sbd *sdp) +static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, + unsigned int total, struct list_head *blist, + bool is_databuf) { - struct buffer_head *bh; struct gfs2_log_descriptor *ld; struct gfs2_bufdata *bd1 = NULL, *bd2; - unsigned int total; - unsigned int limit; + struct page *page; unsigned int num; unsigned n; __be64 *ptr; - limit = buf_limit(sdp); - /* for 4k blocks, limit = 503 */ - gfs2_log_lock(sdp); - total = sdp->sd_log_num_buf; - bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list); + list_sort(NULL, blist, blocknr_cmp); + bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list); while(total) { num = total; if (total > limit) num = limit; gfs2_log_unlock(sdp); - bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA); + page = gfs2_get_log_desc(sdp, + is_databuf ? GFS2_LOG_DESC_JDATA : + GFS2_LOG_DESC_METADATA, num + 1, num); + ld = page_address(page); gfs2_log_lock(sdp); - ld = bh_log_desc(bh); - ptr = bh_log_ptr(bh); - ld->ld_length = cpu_to_be32(num + 1); - ld->ld_data1 = cpu_to_be32(num); + ptr = (__be64 *)(ld + 1); n = 0; - list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf, - bd_le.le_list) { + list_for_each_entry_continue(bd1, blist, bd_list) { *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr); + if (is_databuf) { + gfs2_check_magic(bd1->bd_bh); + *ptr++ = cpu_to_be64(buffer_escaped(bd1->bd_bh) ? 1 : 0); + } if (++n >= num) break; } gfs2_log_unlock(sdp); - submit_bh(WRITE, bh); + gfs2_log_write_page(sdp, page); gfs2_log_lock(sdp); n = 0; - list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf, - bd_le.le_list) { + list_for_each_entry_continue(bd2, blist, bd_list) { get_bh(bd2->bd_bh); gfs2_log_unlock(sdp); lock_buffer(bd2->bd_bh); - bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); - submit_bh(WRITE, bh); + + if (buffer_escaped(bd2->bd_bh)) { + void *kaddr; + page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + ptr = page_address(page); + kaddr = kmap_atomic(bd2->bd_bh->b_page); + memcpy(ptr, kaddr + bh_offset(bd2->bd_bh), + bd2->bd_bh->b_size); + kunmap_atomic(kaddr); + *(__be32 *)ptr = 0; + clear_buffer_escaped(bd2->bd_bh); + unlock_buffer(bd2->bd_bh); + brelse(bd2->bd_bh); + gfs2_log_write_page(sdp, page); + } else { + gfs2_log_write_bh(sdp, bd2->bd_bh); + } gfs2_log_lock(sdp); if (++n >= num) break; @@ -227,31 +491,40 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) gfs2_log_unlock(sdp); } -static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void buf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - struct list_head *head = &sdp->sd_log_le_buf; + unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */ + unsigned int nbuf; + if (tr == NULL) + return; + nbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm; + gfs2_before_commit(sdp, limit, nbuf, &tr->tr_buf, 0); +} + +static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + struct list_head *head; struct gfs2_bufdata *bd; - while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); - list_del_init(&bd->bd_le.le_list); - sdp->sd_log_num_buf--; + if (tr == NULL) + return; - gfs2_unpin(sdp, bd->bd_bh, ai); + head = &tr->tr_buf; + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); + gfs2_unpin(sdp, bd->bd_bh, tr); } - gfs2_assert_warn(sdp, !sdp->sd_log_num_buf); } static void buf_lo_before_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, int pass) { - struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - if (pass != 0) return; - sdp->sd_found_blocks = 0; - sdp->sd_replayed_blocks = 0; + jd->jd_found_blocks = 0; + jd->jd_replayed_blocks = 0; } static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, @@ -274,9 +547,9 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { blkno = be64_to_cpu(*ptr++); - sdp->sd_found_blocks++; + jd->jd_found_blocks++; - if (gfs2_revoke_check(sdp, blkno, start)) + if (gfs2_revoke_check(jd, blkno, start)) continue; error = gfs2_replay_read_block(jd, start, &bh_log); @@ -297,12 +570,34 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, if (error) break; - sdp->sd_replayed_blocks++; + jd->jd_replayed_blocks++; } return error; } +/** + * gfs2_meta_sync - Sync all buffers associated with a glock + * @gl: The glock + * + */ + +static void gfs2_meta_sync(struct gfs2_glock *gl) +{ + struct address_space *mapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; + int error; + + if (mapping == NULL) + mapping = &sdp->sd_aspace; + + filemap_fdatawrite(mapping); + error = filemap_fdatawait(mapping); + + if (error) + gfs2_io_error(gl->gl_sbd); +} + static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) { struct gfs2_inode *ip = GFS2_I(jd->jd_inode); @@ -318,75 +613,73 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) gfs2_meta_sync(ip->i_gl); fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n", - jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); + jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks); } -static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - struct gfs2_trans *tr; - - tr = current->journal_info; - tr->tr_touched = 1; - tr->tr_num_revoke++; - sdp->sd_log_num_revoke++; - list_add(&le->le_list, &sdp->sd_log_le_revoke); -} - -static void revoke_lo_before_commit(struct gfs2_sbd *sdp) -{ - struct gfs2_log_descriptor *ld; struct gfs2_meta_header *mh; - struct buffer_head *bh; unsigned int offset; struct list_head *head = &sdp->sd_log_le_revoke; struct gfs2_bufdata *bd; + struct page *page; + unsigned int length; + gfs2_write_revokes(sdp); if (!sdp->sd_log_num_revoke) return; - bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE); - ld = bh_log_desc(bh); - ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, - sizeof(u64))); - ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke); + length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64)); + page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke); offset = sizeof(struct gfs2_log_descriptor); - while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); - list_del_init(&bd->bd_le.le_list); + list_for_each_entry(bd, head, bd_list) { sdp->sd_log_num_revoke--; if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { - submit_bh(WRITE, bh); - bh = gfs2_log_get_buf(sdp); - mh = (struct gfs2_meta_header *)bh->b_data; + gfs2_log_write_page(sdp, page); + page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + mh = page_address(page); + clear_page(mh); mh->mh_magic = cpu_to_be32(GFS2_MAGIC); mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB); mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB); offset = sizeof(struct gfs2_meta_header); } - *(__be64 *)(bh->b_data + offset) = cpu_to_be64(bd->bd_blkno); - kmem_cache_free(gfs2_bufdata_cachep, bd); - + *(__be64 *)(page_address(page) + offset) = cpu_to_be64(bd->bd_blkno); offset += sizeof(u64); } gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); - submit_bh(WRITE, bh); + gfs2_log_write_page(sdp, page); +} + +static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + struct list_head *head = &sdp->sd_log_le_revoke; + struct gfs2_bufdata *bd; + struct gfs2_glock *gl; + + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); + gl = bd->bd_gl; + atomic_dec(&gl->gl_revokes); + clear_bit(GLF_LFLUSH, &gl->gl_flags); + kmem_cache_free(gfs2_bufdata_cachep, bd); + } } static void revoke_lo_before_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, int pass) { - struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - if (pass != 0) return; - sdp->sd_found_revokes = 0; - sdp->sd_replay_tail = head->lh_tail; + jd->jd_found_revokes = 0; + jd->jd_replay_tail = head->lh_tail; } static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, @@ -418,11 +711,13 @@ static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) { blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); - error = gfs2_revoke_add(sdp, blkno, start); - if (error < 0) + error = gfs2_revoke_add(jd, blkno, start); + if (error < 0) { + brelse(bh); return error; + } else if (error) - sdp->sd_found_revokes++; + jd->jd_found_revokes++; if (!--revokes) break; @@ -442,170 +737,16 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); if (error) { - gfs2_revoke_clean(sdp); + gfs2_revoke_clean(jd); return; } if (pass != 1) return; fs_info(sdp, "jid=%u: Found %u revoke tags\n", - jd->jd_jid, sdp->sd_found_revokes); - - gfs2_revoke_clean(sdp); -} - -static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) -{ - struct gfs2_rgrpd *rgd; - struct gfs2_trans *tr = current->journal_info; - - tr->tr_touched = 1; - - rgd = container_of(le, struct gfs2_rgrpd, rd_le); - - gfs2_log_lock(sdp); - if (!list_empty(&le->le_list)){ - gfs2_log_unlock(sdp); - return; - } - gfs2_rgrp_bh_hold(rgd); - sdp->sd_log_num_rg++; - list_add(&le->le_list, &sdp->sd_log_le_rg); - gfs2_log_unlock(sdp); -} - -static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) -{ - struct list_head *head = &sdp->sd_log_le_rg; - struct gfs2_rgrpd *rgd; - - while (!list_empty(head)) { - rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list); - list_del_init(&rgd->rd_le.le_list); - sdp->sd_log_num_rg--; - - gfs2_rgrp_repolish_clones(rgd); - gfs2_rgrp_bh_put(rgd); - } - gfs2_assert_warn(sdp, !sdp->sd_log_num_rg); -} - -/** - * databuf_lo_add - Add a databuf to the transaction. - * - * This is used in two distinct cases: - * i) In ordered write mode - * We put the data buffer on a list so that we can ensure that its - * synced to disk at the right time - * ii) In journaled data mode - * We need to journal the data block in the same way as metadata in - * the functions above. The difference is that here we have a tag - * which is two __be64's being the block number (as per meta data) - * and a flag which says whether the data block needs escaping or - * not. This means we need a new log entry for each 251 or so data - * blocks, which isn't an enormous overhead but twice as much as - * for normal metadata blocks. - */ -static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) -{ - struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); - struct gfs2_trans *tr = current->journal_info; - struct address_space *mapping = bd->bd_bh->b_page->mapping; - struct gfs2_inode *ip = GFS2_I(mapping->host); - - lock_buffer(bd->bd_bh); - gfs2_log_lock(sdp); - if (tr) { - if (!list_empty(&bd->bd_list_tr)) - goto out; - tr->tr_touched = 1; - if (gfs2_is_jdata(ip)) { - tr->tr_num_buf++; - list_add(&bd->bd_list_tr, &tr->tr_list_buf); - } - } - if (!list_empty(&le->le_list)) - goto out; - - set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); - set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); - if (gfs2_is_jdata(ip)) { - gfs2_pin(sdp, bd->bd_bh); - tr->tr_num_databuf_new++; - sdp->sd_log_num_databuf++; - list_add(&le->le_list, &sdp->sd_log_le_databuf); - } else { - list_add(&le->le_list, &sdp->sd_log_le_ordered); - } -out: - gfs2_log_unlock(sdp); - unlock_buffer(bd->bd_bh); -} - -static void gfs2_check_magic(struct buffer_head *bh) -{ - void *kaddr; - __be32 *ptr; + jd->jd_jid, jd->jd_found_revokes); - clear_buffer_escaped(bh); - kaddr = kmap_atomic(bh->b_page, KM_USER0); - ptr = kaddr + bh_offset(bh); - if (*ptr == cpu_to_be32(GFS2_MAGIC)) - set_buffer_escaped(bh); - kunmap_atomic(kaddr, KM_USER0); -} - -static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, - struct list_head *list, struct list_head *done, - unsigned int n) -{ - struct buffer_head *bh1; - struct gfs2_log_descriptor *ld; - struct gfs2_bufdata *bd; - __be64 *ptr; - - if (!bh) - return; - - ld = bh_log_desc(bh); - ld->ld_length = cpu_to_be32(n + 1); - ld->ld_data1 = cpu_to_be32(n); - - ptr = bh_log_ptr(bh); - - get_bh(bh); - submit_bh(WRITE, bh); - gfs2_log_lock(sdp); - while(!list_empty(list)) { - bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); - list_move_tail(&bd->bd_le.le_list, done); - get_bh(bd->bd_bh); - while (be64_to_cpu(*ptr) != bd->bd_bh->b_blocknr) { - gfs2_log_incr_head(sdp); - ptr += 2; - } - gfs2_log_unlock(sdp); - lock_buffer(bd->bd_bh); - if (buffer_escaped(bd->bd_bh)) { - void *kaddr; - bh1 = gfs2_log_get_buf(sdp); - kaddr = kmap_atomic(bd->bd_bh->b_page, KM_USER0); - memcpy(bh1->b_data, kaddr + bh_offset(bd->bd_bh), - bh1->b_size); - kunmap_atomic(kaddr, KM_USER0); - *(__be32 *)bh1->b_data = 0; - clear_buffer_escaped(bd->bd_bh); - unlock_buffer(bd->bd_bh); - brelse(bd->bd_bh); - } else { - bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); - } - submit_bh(WRITE, bh1); - gfs2_log_lock(sdp); - ptr += 2; - } - gfs2_log_unlock(sdp); - brelse(bh); + gfs2_revoke_clean(jd); } /** @@ -613,39 +754,14 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, * */ -static void databuf_lo_before_commit(struct gfs2_sbd *sdp) +static void databuf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - struct gfs2_bufdata *bd = NULL; - struct buffer_head *bh = NULL; - unsigned int n = 0; - __be64 *ptr = NULL, *end = NULL; - LIST_HEAD(processed); - LIST_HEAD(in_progress); - - gfs2_log_lock(sdp); - while (!list_empty(&sdp->sd_log_le_databuf)) { - if (ptr == end) { - gfs2_log_unlock(sdp); - gfs2_write_blocks(sdp, bh, &in_progress, &processed, n); - n = 0; - bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_JDATA); - ptr = bh_log_ptr(bh); - end = bh_ptr_end(bh) - 1; - gfs2_log_lock(sdp); - continue; - } - bd = list_entry(sdp->sd_log_le_databuf.next, struct gfs2_bufdata, bd_le.le_list); - list_move_tail(&bd->bd_le.le_list, &in_progress); - gfs2_check_magic(bd->bd_bh); - *ptr++ = cpu_to_be64(bd->bd_bh->b_blocknr); - *ptr++ = cpu_to_be64(buffer_escaped(bh) ? 1 : 0); - n++; - } - gfs2_log_unlock(sdp); - gfs2_write_blocks(sdp, bh, &in_progress, &processed, n); - gfs2_log_lock(sdp); - list_splice(&processed, &sdp->sd_log_le_databuf); - gfs2_log_unlock(sdp); + unsigned int limit = databuf_limit(sdp); + unsigned int nbuf; + if (tr == NULL) + return; + nbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm; + gfs2_before_commit(sdp, limit, nbuf, &tr->tr_databuf, 1); } static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, @@ -669,9 +785,9 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, blkno = be64_to_cpu(*ptr++); esc = be64_to_cpu(*ptr++); - sdp->sd_found_blocks++; + jd->jd_found_blocks++; - if (gfs2_revoke_check(sdp, blkno, start)) + if (gfs2_revoke_check(jd, blkno, start)) continue; error = gfs2_replay_read_block(jd, start, &bh_log); @@ -690,10 +806,8 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, brelse(bh_log); brelse(bh_ip); - if (error) - break; - sdp->sd_replayed_blocks++; + jd->jd_replayed_blocks++; } return error; @@ -717,27 +831,27 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) gfs2_meta_sync(ip->i_gl); fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n", - jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); + jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks); } -static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - struct list_head *head = &sdp->sd_log_le_databuf; + struct list_head *head; struct gfs2_bufdata *bd; + if (tr == NULL) + return; + + head = &tr->tr_databuf; while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); - list_del_init(&bd->bd_le.le_list); - sdp->sd_log_num_databuf--; - gfs2_unpin(sdp, bd->bd_bh, ai); + bd = list_entry(head->next, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); + gfs2_unpin(sdp, bd->bd_bh, tr); } - gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf); } const struct gfs2_log_operations gfs2_buf_lops = { - .lo_add = buf_lo_add, - .lo_incore_commit = buf_lo_incore_commit, .lo_before_commit = buf_lo_before_commit, .lo_after_commit = buf_lo_after_commit, .lo_before_scan = buf_lo_before_scan, @@ -747,23 +861,15 @@ const struct gfs2_log_operations gfs2_buf_lops = { }; const struct gfs2_log_operations gfs2_revoke_lops = { - .lo_add = revoke_lo_add, .lo_before_commit = revoke_lo_before_commit, + .lo_after_commit = revoke_lo_after_commit, .lo_before_scan = revoke_lo_before_scan, .lo_scan_elements = revoke_lo_scan_elements, .lo_after_scan = revoke_lo_after_scan, .lo_name = "revoke", }; -const struct gfs2_log_operations gfs2_rg_lops = { - .lo_add = rg_lo_add, - .lo_after_commit = rg_lo_after_commit, - .lo_name = "rg", -}; - const struct gfs2_log_operations gfs2_databuf_lops = { - .lo_add = databuf_lo_add, - .lo_incore_commit = buf_lo_incore_commit, .lo_before_commit = databuf_lo_before_commit, .lo_after_commit = databuf_lo_after_commit, .lo_scan_elements = databuf_lo_scan_elements, @@ -774,7 +880,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = { const struct gfs2_log_operations *gfs2_log_ops[] = { &gfs2_databuf_lops, &gfs2_buf_lops, - &gfs2_rg_lops, &gfs2_revoke_lops, NULL, }; diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 41a00df7558..a65a7ba32ff 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -23,10 +23,12 @@ extern const struct gfs2_log_operations gfs2_glock_lops; extern const struct gfs2_log_operations gfs2_buf_lops; extern const struct gfs2_log_operations gfs2_revoke_lops; -extern const struct gfs2_log_operations gfs2_rg_lops; extern const struct gfs2_log_operations gfs2_databuf_lops; extern const struct gfs2_log_operations *gfs2_log_ops[]; +extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); +extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw); +extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { @@ -44,42 +46,22 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp) return limit; } -static inline void lops_init_le(struct gfs2_log_element *le, - const struct gfs2_log_operations *lops) -{ - INIT_LIST_HEAD(&le->le_list); - le->le_ops = lops; -} - -static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) -{ - if (le->le_ops->lo_add) - le->le_ops->lo_add(sdp, le); -} - -static inline void lops_incore_commit(struct gfs2_sbd *sdp, +static inline void lops_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { int x; for (x = 0; gfs2_log_ops[x]; x++) - if (gfs2_log_ops[x]->lo_incore_commit) - gfs2_log_ops[x]->lo_incore_commit(sdp, tr); -} - -static inline void lops_before_commit(struct gfs2_sbd *sdp) -{ - int x; - for (x = 0; gfs2_log_ops[x]; x++) if (gfs2_log_ops[x]->lo_before_commit) - gfs2_log_ops[x]->lo_before_commit(sdp); + gfs2_log_ops[x]->lo_before_commit(sdp, tr); } -static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static inline void lops_after_commit(struct gfs2_sbd *sdp, + struct gfs2_trans *tr) { int x; for (x = 0; gfs2_log_ops[x]; x++) if (gfs2_log_ops[x]->lo_after_commit) - gfs2_log_ops[x]->lo_after_commit(sdp, ai); + gfs2_log_ops[x]->lo_after_commit(sdp, tr); } static inline void lops_before_scan(struct gfs2_jdesc *jd, diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 9c7765c12d6..82b6ac82965 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -14,39 +16,54 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> -#include <asm/atomic.h> +#include <linux/rcupdate.h> +#include <linux/rculist_bl.h> +#include <linux/atomic.h> +#include <linux/mempool.h> #include "gfs2.h" #include "incore.h" -#include "ops_fstype.h" +#include "super.h" #include "sys.h" #include "util.h" #include "glock.h" +#include "quota.h" +#include "recovery.h" +#include "dir.h" + +struct workqueue_struct *gfs2_control_wq; -static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo) +static void gfs2_init_inode_once(void *foo) { struct gfs2_inode *ip = foo; inode_init_once(&ip->i_inode); init_rwsem(&ip->i_rw_mutex); - ip->i_alloc = NULL; + INIT_LIST_HEAD(&ip->i_trunc_list); + ip->i_res = NULL; + ip->i_hash_cache = NULL; } -static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo) +static void gfs2_init_glock_once(void *foo) { struct gfs2_glock *gl = foo; - INIT_HLIST_NODE(&gl->gl_list); + INIT_HLIST_BL_NODE(&gl->gl_list); spin_lock_init(&gl->gl_spin); INIT_LIST_HEAD(&gl->gl_holders); - INIT_LIST_HEAD(&gl->gl_waiters1); - INIT_LIST_HEAD(&gl->gl_waiters3); - gl->gl_lvb = NULL; - atomic_set(&gl->gl_lvb_count, 0); - INIT_LIST_HEAD(&gl->gl_reclaim); + INIT_LIST_HEAD(&gl->gl_lru); INIT_LIST_HEAD(&gl->gl_ail_list); atomic_set(&gl->gl_ail_count, 0); + atomic_set(&gl->gl_revokes, 0); +} + +static void gfs2_init_gl_aspace_once(void *foo) +{ + struct gfs2_glock *gl = foo; + struct address_space *mapping = (struct address_space *)(gl + 1); + + gfs2_init_glock_once(gl); + address_space_init_once(mapping); } /** @@ -59,10 +76,18 @@ static int __init init_gfs2_fs(void) { int error; + gfs2_str2qstr(&gfs2_qdot, "."); + gfs2_str2qstr(&gfs2_qdotdot, ".."); + gfs2_quota_hash_init(); + error = gfs2_sys_init(); if (error) return error; + error = list_lru_init(&gfs2_qd_lru); + if (error) + goto fail_lru; + error = gfs2_glock_init(); if (error) goto fail; @@ -75,6 +100,14 @@ static int __init init_gfs2_fs(void) if (!gfs2_glock_cachep) goto fail; + gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)", + sizeof(struct gfs2_glock) + + sizeof(struct address_space), + 0, 0, gfs2_init_gl_aspace_once); + + if (!gfs2_glock_aspace_cachep) + goto fail; + gfs2_inode_cachep = kmem_cache_create("gfs2_inode", sizeof(struct gfs2_inode), 0, SLAB_RECLAIM_ACCOUNT| @@ -89,6 +122,26 @@ static int __init init_gfs2_fs(void) if (!gfs2_bufdata_cachep) goto fail; + gfs2_rgrpd_cachep = kmem_cache_create("gfs2_rgrpd", + sizeof(struct gfs2_rgrpd), + 0, 0, NULL); + if (!gfs2_rgrpd_cachep) + goto fail; + + gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad", + sizeof(struct gfs2_quota_data), + 0, 0, NULL); + if (!gfs2_quotad_cachep) + goto fail; + + gfs2_rsrv_cachep = kmem_cache_create("gfs2_mblk", + sizeof(struct gfs2_blkreserv), + 0, 0, NULL); + if (!gfs2_rsrv_cachep) + goto fail; + + register_shrinker(&gfs2_qd_shrinker); + error = register_filesystem(&gfs2_fs_type); if (error) goto fail; @@ -97,23 +150,59 @@ static int __init init_gfs2_fs(void) if (error) goto fail_unregister; + error = -ENOMEM; + gfs_recovery_wq = alloc_workqueue("gfs_recovery", + WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); + if (!gfs_recovery_wq) + goto fail_wq; + + gfs2_control_wq = alloc_workqueue("gfs2_control", + WQ_UNBOUND | WQ_FREEZABLE, 0); + if (!gfs2_control_wq) + goto fail_recovery; + + gfs2_page_pool = mempool_create_page_pool(64, 0); + if (!gfs2_page_pool) + goto fail_control; + gfs2_register_debugfs(); - printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__); + pr_info("GFS2 installed\n"); return 0; +fail_control: + destroy_workqueue(gfs2_control_wq); +fail_recovery: + destroy_workqueue(gfs_recovery_wq); +fail_wq: + unregister_filesystem(&gfs2meta_fs_type); fail_unregister: unregister_filesystem(&gfs2_fs_type); fail: + list_lru_destroy(&gfs2_qd_lru); +fail_lru: + unregister_shrinker(&gfs2_qd_shrinker); gfs2_glock_exit(); + if (gfs2_rsrv_cachep) + kmem_cache_destroy(gfs2_rsrv_cachep); + + if (gfs2_quotad_cachep) + kmem_cache_destroy(gfs2_quotad_cachep); + + if (gfs2_rgrpd_cachep) + kmem_cache_destroy(gfs2_rgrpd_cachep); + if (gfs2_bufdata_cachep) kmem_cache_destroy(gfs2_bufdata_cachep); if (gfs2_inode_cachep) kmem_cache_destroy(gfs2_inode_cachep); + if (gfs2_glock_aspace_cachep) + kmem_cache_destroy(gfs2_glock_aspace_cachep); + if (gfs2_glock_cachep) kmem_cache_destroy(gfs2_glock_cachep); @@ -128,13 +217,24 @@ fail: static void __exit exit_gfs2_fs(void) { + unregister_shrinker(&gfs2_qd_shrinker); gfs2_glock_exit(); gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); + destroy_workqueue(gfs_recovery_wq); + destroy_workqueue(gfs2_control_wq); + list_lru_destroy(&gfs2_qd_lru); + + rcu_barrier(); + mempool_destroy(gfs2_page_pool); + kmem_cache_destroy(gfs2_rsrv_cachep); + kmem_cache_destroy(gfs2_quotad_cachep); + kmem_cache_destroy(gfs2_rgrpd_cachep); kmem_cache_destroy(gfs2_bufdata_cachep); kmem_cache_destroy(gfs2_inode_cachep); + kmem_cache_destroy(gfs2_glock_aspace_cachep); kmem_cache_destroy(gfs2_glock_cachep); gfs2_sys_uninit(); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 85aea27b4a8..b984a6e190b 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -19,7 +19,6 @@ #include <linux/delay.h> #include <linux/bio.h> #include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> #include "gfs2.h" #include "incore.h" @@ -32,102 +31,79 @@ #include "rgrp.h" #include "trans.h" #include "util.h" -#include "ops_address.h" +#include "trace_gfs2.h" -static int aspace_get_block(struct inode *inode, sector_t lblock, - struct buffer_head *bh_result, int create) +static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) { - gfs2_assert_warn(inode->i_sb->s_fs_info, 0); - return -EOPNOTSUPP; -} + struct buffer_head *bh, *head; + int nr_underway = 0; + int write_op = REQ_META | REQ_PRIO | + (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + + BUG_ON(!PageLocked(page)); + BUG_ON(!page_has_buffers(page)); + + head = page_buffers(page); + bh = head; + + do { + if (!buffer_mapped(bh)) + continue; + /* + * If it's a fully non-blocking write attempt and we cannot + * lock the buffer then redirty the page. Note that this can + * potentially cause a busy-wait loop from flusher thread and kswapd + * activity, but those code paths have their own higher-level + * throttling. + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + lock_buffer(bh); + } else if (!trylock_buffer(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + } while ((bh = bh->b_this_page) != head); + + /* + * The page and its buffers are protected by PageWriteback(), so we can + * drop the bh refcounts early. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(write_op, bh); + nr_underway++; + } + bh = next; + } while (bh != head); + unlock_page(page); -static int gfs2_aspace_writepage(struct page *page, - struct writeback_control *wbc) -{ - return block_write_full_page(page, aspace_get_block, wbc); + if (nr_underway == 0) + end_page_writeback(page); + + return 0; } -static const struct address_space_operations aspace_aops = { +const struct address_space_operations gfs2_meta_aops = { .writepage = gfs2_aspace_writepage, .releasepage = gfs2_releasepage, - .sync_page = block_sync_page, }; -/** - * gfs2_aspace_get - Create and initialize a struct inode structure - * @sdp: the filesystem the aspace is in - * - * Right now a struct inode is just a struct inode. Maybe Linux - * will supply a more lightweight address space construct (that works) - * in the future. - * - * Make sure pages/buffers in this aspace aren't in high memory. - * - * Returns: the aspace - */ - -struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp) -{ - struct inode *aspace; - - aspace = new_inode(sdp->sd_vfs); - if (aspace) { - mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS); - aspace->i_mapping->a_ops = &aspace_aops; - aspace->i_size = ~0ULL; - aspace->i_private = NULL; - insert_inode_hash(aspace); - } - return aspace; -} - -void gfs2_aspace_put(struct inode *aspace) -{ - remove_inode_hash(aspace); - iput(aspace); -} - -/** - * gfs2_meta_inval - Invalidate all buffers associated with a glock - * @gl: the glock - * - */ - -void gfs2_meta_inval(struct gfs2_glock *gl) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - struct inode *aspace = gl->gl_aspace; - struct address_space *mapping = gl->gl_aspace->i_mapping; - - gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); - - atomic_inc(&aspace->i_writecount); - truncate_inode_pages(mapping, 0); - atomic_dec(&aspace->i_writecount); - - gfs2_assert_withdraw(sdp, !mapping->nrpages); -} - -/** - * gfs2_meta_sync - Sync all buffers associated with a glock - * @gl: The glock - * - */ - -void gfs2_meta_sync(struct gfs2_glock *gl) -{ - struct address_space *mapping = gl->gl_aspace->i_mapping; - int error; - - filemap_fdatawrite(mapping); - error = filemap_fdatawait(mapping); - - if (error) - gfs2_io_error(gl->gl_sbd); -} +const struct address_space_operations gfs2_rgrp_aops = { + .writepage = gfs2_aspace_writepage, + .releasepage = gfs2_releasepage, +}; /** - * getbuf - Get a buffer with a given address space + * gfs2_getbuf - Get a buffer with a given address space * @gl: the glock * @blkno: the block number (filesystem scope) * @create: 1 if the buffer should be created @@ -135,9 +111,9 @@ void gfs2_meta_sync(struct gfs2_glock *gl) * Returns: the buffer */ -static struct buffer_head *getbuf(struct gfs2_glock *gl, u64 blkno, int create) +struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) { - struct address_space *mapping = gl->gl_aspace->i_mapping; + struct address_space *mapping = gfs2_glock2aspace(gl); struct gfs2_sbd *sdp = gl->gl_sbd; struct page *page; struct buffer_head *bh; @@ -145,6 +121,9 @@ static struct buffer_head *getbuf(struct gfs2_glock *gl, u64 blkno, int create) unsigned long index; unsigned int bufnum; + if (mapping == NULL) + mapping = &sdp->sd_aspace; + shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift; index = blkno >> shift; /* convert block to page */ bufnum = blkno - (index << shift); /* block buf index within page */ @@ -157,7 +136,8 @@ static struct buffer_head *getbuf(struct gfs2_glock *gl, u64 blkno, int create) yield(); } } else { - page = find_lock_page(mapping, index); + page = find_get_page_flags(mapping, index, + FGP_LOCK|FGP_ACCESSED); if (!page) return NULL; } @@ -174,7 +154,6 @@ static struct buffer_head *getbuf(struct gfs2_glock *gl, u64 blkno, int create) map_bh(bh, sdp->sd_vfs, blkno); unlock_page(page); - mark_page_accessed(page); page_cache_release(page); return bh; @@ -203,7 +182,7 @@ static void meta_prep_new(struct buffer_head *bh) struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) { struct buffer_head *bh; - bh = getbuf(gl, blkno, CREATE); + bh = gfs2_getbuf(gl, blkno, CREATE); meta_prep_new(bh); return bh; } @@ -221,16 +200,35 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head **bhp) { - *bhp = getbuf(gl, blkno, CREATE); - if (!buffer_uptodate(*bhp)) { - ll_rw_block(READ_META, 1, bhp); - if (flags & DIO_WAIT) { - int error = gfs2_meta_wait(gl->gl_sbd, *bhp); - if (error) { - brelse(*bhp); - return error; - } - } + struct gfs2_sbd *sdp = gl->gl_sbd; + struct buffer_head *bh; + + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { + *bhp = NULL; + return -EIO; + } + + *bhp = bh = gfs2_getbuf(gl, blkno, CREATE); + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + return 0; + } + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); + if (!(flags & DIO_WAIT)) + return 0; + + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) { + struct gfs2_trans *tr = current->journal_info; + if (tr && tr->tr_touched) + gfs2_io_error_bh(sdp, bh); + brelse(bh); + *bhp = NULL; + return -EIO; } return 0; @@ -263,68 +261,34 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) return 0; } -/** - * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer - * @gl: the glock the buffer belongs to - * @bh: The buffer to be attached to - * @meta: Flag to indicate whether its metadata or not - */ - -void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, - int meta) -{ - struct gfs2_bufdata *bd; - - if (meta) - lock_page(bh->b_page); - - if (bh->b_private) { - if (meta) - unlock_page(bh->b_page); - return; - } - - bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); - bd->bd_bh = bh; - bd->bd_gl = gl; - - INIT_LIST_HEAD(&bd->bd_list_tr); - if (meta) - lops_init_le(&bd->bd_le, &gfs2_buf_lops); - else - lops_init_le(&bd->bd_le, &gfs2_databuf_lops); - bh->b_private = bd; - - if (meta) - unlock_page(bh->b_page); -} - void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta) { - struct gfs2_sbd *sdp = GFS2_SB(bh->b_page->mapping->host); + struct address_space *mapping = bh->b_page->mapping; + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); struct gfs2_bufdata *bd = bh->b_private; + int was_pinned = 0; + if (test_clear_buffer_pinned(bh)) { - list_del_init(&bd->bd_le.le_list); - if (meta) { - gfs2_assert_warn(sdp, sdp->sd_log_num_buf); - sdp->sd_log_num_buf--; + trace_gfs2_pin(bd, 0); + atomic_dec(&sdp->sd_log_pinned); + list_del_init(&bd->bd_list); + if (meta) tr->tr_num_buf_rm++; - } else { - gfs2_assert_warn(sdp, sdp->sd_log_num_databuf); - sdp->sd_log_num_databuf--; + else tr->tr_num_databuf_rm++; - } tr->tr_touched = 1; + was_pinned = 1; brelse(bh); } if (bd) { - if (bd->bd_ail) { - gfs2_remove_from_ail(bd); - bh->b_private = NULL; - bd->bd_bh = NULL; - bd->bd_blkno = bh->b_blocknr; + spin_lock(&sdp->sd_ail_lock); + if (bd->bd_tr) { gfs2_trans_add_revoke(sdp, bd); + } else if (was_pinned) { + bh->b_private = NULL; + kmem_cache_free(gfs2_bufdata_cachep, bd); } + spin_unlock(&sdp->sd_ail_lock); } clear_buffer_dirty(bh); clear_buffer_uptodate(bh); @@ -344,7 +308,7 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) struct buffer_head *bh; while (blen) { - bh = getbuf(ip->i_gl, bstart, NO_CREATE); + bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE); if (bh) { lock_buffer(bh); gfs2_log_lock(sdp); @@ -364,33 +328,24 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) * @ip: The GFS2 inode * @height: The level of this buf in the metadata (indir addr) tree (if any) * @num: The block number (device relative) of the buffer - * @new: Non-zero if we may create a new buffer * @bhp: the buffer is returned here * * Returns: errno */ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, - int new, struct buffer_head **bhp) + struct buffer_head **bhp) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_glock *gl = ip->i_gl; struct buffer_head *bh; int ret = 0; + u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI; - if (new) { - BUG_ON(height == 0); - bh = gfs2_meta_new(gl, num); - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); - gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); - } else { - u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI; - ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh); - if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { - brelse(bh); - ret = -EIO; - } + ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh); + if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { + brelse(bh); + ret = -EIO; } *bhp = bh; return ret; @@ -419,21 +374,21 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (extlen > max_ra) extlen = max_ra; - first_bh = getbuf(gl, dblock, CREATE); + first_bh = gfs2_getbuf(gl, dblock, CREATE); if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(READ_META, 1, &first_bh); + ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh); dblock++; extlen--; while (extlen) { - bh = getbuf(gl, dblock, CREATE); + bh = gfs2_getbuf(gl, dblock, CREATE); if (!buffer_uptodate(bh) && !buffer_locked(bh)) - ll_rw_block(READA, 1, &bh); + ll_rw_block(READA | REQ_META, 1, &bh); brelse(bh); dblock++; extlen--; diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 73e3b1c76fe..ac5d8027d33 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -37,40 +37,42 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh, 0, from_head - to_head); } -struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp); -void gfs2_aspace_put(struct inode *aspace); +extern const struct address_space_operations gfs2_meta_aops; +extern const struct address_space_operations gfs2_rgrp_aops; -void gfs2_meta_inval(struct gfs2_glock *gl); -void gfs2_meta_sync(struct gfs2_glock *gl); - -struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); -int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, - int flags, struct buffer_head **bhp); -int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); - -void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, - int meta); - -void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, - int meta); - -void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); +static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) +{ + struct inode *inode = mapping->host; + if (mapping->a_ops == &gfs2_meta_aops) + return (((struct gfs2_glock *)mapping) - 1)->gl_sbd; + else if (mapping->a_ops == &gfs2_rgrp_aops) + return container_of(mapping, struct gfs2_sbd, sd_aspace); + else + return inode->i_sb->s_fs_info; +} -int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, - int new, struct buffer_head **bhp); +extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); +extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, + struct buffer_head **bhp); +extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); +extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, + int create); +extern void gfs2_remove_from_journal(struct buffer_head *bh, + struct gfs2_trans *tr, int meta); +extern void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, + struct buffer_head **bhp); static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip, struct buffer_head **bhp) { - return gfs2_meta_indirect_buffer(ip, 0, ip->i_no_addr, 0, bhp); + return gfs2_meta_indirect_buffer(ip, 0, ip->i_no_addr, bhp); } struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen); #define buffer_busy(bh) \ ((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned))) -#define buffer_in_io(bh) \ -((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock))) #endif /* __DIO_DOT_H__ */ diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c deleted file mode 100644 index b941f9f9f95..00000000000 --- a/fs/gfs2/mount.c +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/completion.h> -#include <linux/buffer_head.h> -#include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> -#include <linux/parser.h> - -#include "gfs2.h" -#include "incore.h" -#include "mount.h" -#include "sys.h" -#include "util.h" - -enum { - Opt_lockproto, - Opt_locktable, - Opt_hostdata, - Opt_spectator, - Opt_ignore_local_fs, - Opt_localflocks, - Opt_localcaching, - Opt_debug, - Opt_nodebug, - Opt_upgrade, - Opt_num_glockd, - Opt_acl, - Opt_noacl, - Opt_quota_off, - Opt_quota_account, - Opt_quota_on, - Opt_suiddir, - Opt_nosuiddir, - Opt_data_writeback, - Opt_data_ordered, - Opt_err, -}; - -static match_table_t tokens = { - {Opt_lockproto, "lockproto=%s"}, - {Opt_locktable, "locktable=%s"}, - {Opt_hostdata, "hostdata=%s"}, - {Opt_spectator, "spectator"}, - {Opt_ignore_local_fs, "ignore_local_fs"}, - {Opt_localflocks, "localflocks"}, - {Opt_localcaching, "localcaching"}, - {Opt_debug, "debug"}, - {Opt_nodebug, "nodebug"}, - {Opt_upgrade, "upgrade"}, - {Opt_num_glockd, "num_glockd=%d"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_quota_off, "quota=off"}, - {Opt_quota_account, "quota=account"}, - {Opt_quota_on, "quota=on"}, - {Opt_suiddir, "suiddir"}, - {Opt_nosuiddir, "nosuiddir"}, - {Opt_data_writeback, "data=writeback"}, - {Opt_data_ordered, "data=ordered"}, - {Opt_err, NULL} -}; - -/** - * gfs2_mount_args - Parse mount options - * @sdp: - * @data: - * - * Return: errno - */ - -int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) -{ - struct gfs2_args *args = &sdp->sd_args; - char *data = data_arg; - char *options, *o, *v; - int error = 0; - - if (!remount) { - /* If someone preloaded options, use those instead */ - spin_lock(&gfs2_sys_margs_lock); - if (gfs2_sys_margs) { - data = gfs2_sys_margs; - gfs2_sys_margs = NULL; - } - spin_unlock(&gfs2_sys_margs_lock); - - /* Set some defaults */ - args->ar_num_glockd = GFS2_GLOCKD_DEFAULT; - args->ar_quota = GFS2_QUOTA_DEFAULT; - args->ar_data = GFS2_DATA_DEFAULT; - } - - /* Split the options into tokens with the "," character and - process them */ - - for (options = data; (o = strsep(&options, ",")); ) { - int token, option; - substring_t tmp[MAX_OPT_ARGS]; - - if (!*o) - continue; - - token = match_token(o, tokens, tmp); - switch (token) { - case Opt_lockproto: - v = match_strdup(&tmp[0]); - if (!v) { - fs_info(sdp, "no memory for lockproto\n"); - error = -ENOMEM; - goto out_error; - } - - if (remount && strcmp(v, args->ar_lockproto)) { - kfree(v); - goto cant_remount; - } - - strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN); - args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0; - kfree(v); - break; - case Opt_locktable: - v = match_strdup(&tmp[0]); - if (!v) { - fs_info(sdp, "no memory for locktable\n"); - error = -ENOMEM; - goto out_error; - } - - if (remount && strcmp(v, args->ar_locktable)) { - kfree(v); - goto cant_remount; - } - - strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN); - args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0; - kfree(v); - break; - case Opt_hostdata: - v = match_strdup(&tmp[0]); - if (!v) { - fs_info(sdp, "no memory for hostdata\n"); - error = -ENOMEM; - goto out_error; - } - - if (remount && strcmp(v, args->ar_hostdata)) { - kfree(v); - goto cant_remount; - } - - strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN); - args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0; - kfree(v); - break; - case Opt_spectator: - if (remount && !args->ar_spectator) - goto cant_remount; - args->ar_spectator = 1; - sdp->sd_vfs->s_flags |= MS_RDONLY; - break; - case Opt_ignore_local_fs: - if (remount && !args->ar_ignore_local_fs) - goto cant_remount; - args->ar_ignore_local_fs = 1; - break; - case Opt_localflocks: - if (remount && !args->ar_localflocks) - goto cant_remount; - args->ar_localflocks = 1; - break; - case Opt_localcaching: - if (remount && !args->ar_localcaching) - goto cant_remount; - args->ar_localcaching = 1; - break; - case Opt_debug: - args->ar_debug = 1; - break; - case Opt_nodebug: - args->ar_debug = 0; - break; - case Opt_upgrade: - if (remount && !args->ar_upgrade) - goto cant_remount; - args->ar_upgrade = 1; - break; - case Opt_num_glockd: - if ((error = match_int(&tmp[0], &option))) { - fs_info(sdp, "problem getting num_glockd\n"); - goto out_error; - } - - if (remount && option != args->ar_num_glockd) - goto cant_remount; - if (!option || option > GFS2_GLOCKD_MAX) { - fs_info(sdp, "0 < num_glockd <= %u (not %u)\n", - GFS2_GLOCKD_MAX, option); - error = -EINVAL; - goto out_error; - } - args->ar_num_glockd = option; - break; - case Opt_acl: - args->ar_posix_acl = 1; - sdp->sd_vfs->s_flags |= MS_POSIXACL; - break; - case Opt_noacl: - args->ar_posix_acl = 0; - sdp->sd_vfs->s_flags &= ~MS_POSIXACL; - break; - case Opt_quota_off: - args->ar_quota = GFS2_QUOTA_OFF; - break; - case Opt_quota_account: - args->ar_quota = GFS2_QUOTA_ACCOUNT; - break; - case Opt_quota_on: - args->ar_quota = GFS2_QUOTA_ON; - break; - case Opt_suiddir: - args->ar_suiddir = 1; - break; - case Opt_nosuiddir: - args->ar_suiddir = 0; - break; - case Opt_data_writeback: - args->ar_data = GFS2_DATA_WRITEBACK; - break; - case Opt_data_ordered: - args->ar_data = GFS2_DATA_ORDERED; - break; - case Opt_err: - default: - fs_info(sdp, "unknown option: %s\n", o); - error = -EINVAL; - goto out_error; - } - } - -out_error: - if (error) - fs_info(sdp, "invalid mount option(s)\n"); - - if (data != data_arg) - kfree(data); - - return error; - -cant_remount: - fs_info(sdp, "can't remount with option %s\n", o); - return -EINVAL; -} - diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h deleted file mode 100644 index 401288acfdf..00000000000 --- a/fs/gfs2/mount.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __MOUNT_DOT_H__ -#define __MOUNT_DOT_H__ - -struct gfs2_sbd; - -int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount); - -#endif /* __MOUNT_DOT_H__ */ diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h deleted file mode 100644 index 5da21285bba..00000000000 --- a/fs/gfs2/ops_address.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_ADDRESS_DOT_H__ -#define __OPS_ADDRESS_DOT_H__ - -#include <linux/fs.h> -#include <linux/buffer_head.h> -#include <linux/mm.h> - -extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); -extern int gfs2_internal_read(struct gfs2_inode *ip, - struct file_ra_state *ra_state, - char *buf, loff_t *pos, unsigned size); -extern void gfs2_set_aops(struct inode *inode); - -#endif /* __OPS_ADDRESS_DOT_H__ */ diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h deleted file mode 100644 index 5caa3db4d3f..00000000000 --- a/fs/gfs2/ops_dentry.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_DENTRY_DOT_H__ -#define __OPS_DENTRY_DOT_H__ - -#include <linux/dcache.h> - -extern struct dentry_operations gfs2_dops; - -#endif /* __OPS_DENTRY_DOT_H__ */ diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c deleted file mode 100644 index f4842f2548c..00000000000 --- a/fs/gfs2/ops_file.c +++ /dev/null @@ -1,771 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/completion.h> -#include <linux/buffer_head.h> -#include <linux/pagemap.h> -#include <linux/uio.h> -#include <linux/blkdev.h> -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/gfs2_ondisk.h> -#include <linux/ext2_fs.h> -#include <linux/crc32.h> -#include <linux/lm_interface.h> -#include <linux/writeback.h> -#include <asm/uaccess.h> - -#include "gfs2.h" -#include "incore.h" -#include "bmap.h" -#include "dir.h" -#include "glock.h" -#include "glops.h" -#include "inode.h" -#include "lm.h" -#include "log.h" -#include "meta_io.h" -#include "quota.h" -#include "rgrp.h" -#include "trans.h" -#include "util.h" -#include "eaops.h" -#include "ops_address.h" - -/** - * gfs2_llseek - seek to a location in a file - * @file: the file - * @offset: the offset - * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) - * - * SEEK_END requires the glock for the file because it references the - * file's size. - * - * Returns: The new offset, or errno - */ - -static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) -{ - struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - struct gfs2_holder i_gh; - loff_t error; - - if (origin == 2) { - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, - &i_gh); - if (!error) { - error = remote_llseek(file, offset, origin); - gfs2_glock_dq_uninit(&i_gh); - } - } else - error = remote_llseek(file, offset, origin); - - return error; -} - -/** - * gfs2_readdir - Read directory entries from a directory - * @file: The directory to read from - * @dirent: Buffer for dirents - * @filldir: Function used to do the copying - * - * Returns: errno - */ - -static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) -{ - struct inode *dir = file->f_mapping->host; - struct gfs2_inode *dip = GFS2_I(dir); - struct gfs2_holder d_gh; - u64 offset = file->f_pos; - int error; - - gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh); - error = gfs2_glock_nq_atime(&d_gh); - if (error) { - gfs2_holder_uninit(&d_gh); - return error; - } - - error = gfs2_dir_read(dir, &offset, dirent, filldir); - - gfs2_glock_dq_uninit(&d_gh); - - file->f_pos = offset; - - return error; -} - -/** - * fsflags_cvt - * @table: A table of 32 u32 flags - * @val: a 32 bit value to convert - * - * This function can be used to convert between fsflags values and - * GFS2's own flags values. - * - * Returns: the converted flags - */ -static u32 fsflags_cvt(const u32 *table, u32 val) -{ - u32 res = 0; - while(val) { - if (val & 1) - res |= *table; - table++; - val >>= 1; - } - return res; -} - -static const u32 fsflags_to_gfs2[32] = { - [3] = GFS2_DIF_SYNC, - [4] = GFS2_DIF_IMMUTABLE, - [5] = GFS2_DIF_APPENDONLY, - [7] = GFS2_DIF_NOATIME, - [12] = GFS2_DIF_EXHASH, - [14] = GFS2_DIF_INHERIT_JDATA, - [20] = GFS2_DIF_INHERIT_DIRECTIO, -}; - -static const u32 gfs2_to_fsflags[32] = { - [gfs2fl_Sync] = FS_SYNC_FL, - [gfs2fl_Immutable] = FS_IMMUTABLE_FL, - [gfs2fl_AppendOnly] = FS_APPEND_FL, - [gfs2fl_NoAtime] = FS_NOATIME_FL, - [gfs2fl_ExHash] = FS_INDEX_FL, - [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL, - [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, -}; - -static int gfs2_get_flags(struct file *filp, u32 __user *ptr) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int error; - u32 fsflags; - - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); - error = gfs2_glock_nq_atime(&gh); - if (error) - return error; - - fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags); - if (!S_ISDIR(inode->i_mode)) { - if (ip->i_di.di_flags & GFS2_DIF_JDATA) - fsflags |= FS_JOURNAL_DATA_FL; - if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO) - fsflags |= FS_DIRECTIO_FL; - } - if (put_user(fsflags, ptr)) - error = -EFAULT; - - gfs2_glock_dq(&gh); - gfs2_holder_uninit(&gh); - return error; -} - -void gfs2_set_inode_flags(struct inode *inode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_dinode_host *di = &ip->i_di; - unsigned int flags = inode->i_flags; - - flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - if (di->di_flags & GFS2_DIF_IMMUTABLE) - flags |= S_IMMUTABLE; - if (di->di_flags & GFS2_DIF_APPENDONLY) - flags |= S_APPEND; - if (di->di_flags & GFS2_DIF_NOATIME) - flags |= S_NOATIME; - if (di->di_flags & GFS2_DIF_SYNC) - flags |= S_SYNC; - inode->i_flags = flags; -} - -/* Flags that can be set by user space */ -#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ - GFS2_DIF_DIRECTIO| \ - GFS2_DIF_IMMUTABLE| \ - GFS2_DIF_APPENDONLY| \ - GFS2_DIF_NOATIME| \ - GFS2_DIF_SYNC| \ - GFS2_DIF_SYSTEM| \ - GFS2_DIF_INHERIT_DIRECTIO| \ - GFS2_DIF_INHERIT_JDATA) - -/** - * gfs2_set_flags - set flags on an inode - * @inode: The inode - * @flags: The flags to set - * @mask: Indicates which flags are valid - * - */ -static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct buffer_head *bh; - struct gfs2_holder gh; - int error; - u32 new_flags, flags; - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (error) - return error; - - flags = ip->i_di.di_flags; - new_flags = (flags & ~mask) | (reqflags & mask); - if ((new_flags ^ flags) == 0) - goto out; - - error = -EINVAL; - if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET) - goto out; - - error = -EPERM; - if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE)) - goto out; - if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY)) - goto out; - if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) && - !capable(CAP_LINUX_IMMUTABLE)) - goto out; - if (!IS_IMMUTABLE(inode)) { - error = permission(inode, MAY_WRITE, NULL); - if (error) - goto out; - } - if ((flags ^ new_flags) & GFS2_DIF_JDATA) { - if (flags & GFS2_DIF_JDATA) - gfs2_log_flush(sdp, ip->i_gl); - error = filemap_fdatawrite(inode->i_mapping); - if (error) - goto out; - error = filemap_fdatawait(inode->i_mapping); - if (error) - goto out; - } - error = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (error) - goto out; - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - goto out_trans_end; - gfs2_trans_add_bh(ip->i_gl, bh, 1); - ip->i_di.di_flags = new_flags; - gfs2_dinode_out(ip, bh->b_data); - brelse(bh); - gfs2_set_inode_flags(inode); - gfs2_set_aops(inode); -out_trans_end: - gfs2_trans_end(sdp); -out: - gfs2_glock_dq_uninit(&gh); - return error; -} - -static int gfs2_set_flags(struct file *filp, u32 __user *ptr) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - u32 fsflags, gfsflags; - if (get_user(fsflags, ptr)) - return -EFAULT; - gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); - if (!S_ISDIR(inode->i_mode)) { - if (gfsflags & GFS2_DIF_INHERIT_JDATA) - gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); - if (gfsflags & GFS2_DIF_INHERIT_DIRECTIO) - gfsflags ^= (GFS2_DIF_DIRECTIO | GFS2_DIF_INHERIT_DIRECTIO); - return do_gfs2_set_flags(filp, gfsflags, ~0); - } - return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); -} - -static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - switch(cmd) { - case FS_IOC_GETFLAGS: - return gfs2_get_flags(filp, (u32 __user *)arg); - case FS_IOC_SETFLAGS: - return gfs2_set_flags(filp, (u32 __user *)arg); - } - return -ENOTTY; -} - -/** - * gfs2_allocate_page_backing - Use bmap to allocate blocks - * @page: The (locked) page to allocate backing for - * - * We try to allocate all the blocks required for the page in - * one go. This might fail for various reasons, so we keep - * trying until all the blocks to back this page are allocated. - * If some of the blocks are already allocated, thats ok too. - */ - -static int gfs2_allocate_page_backing(struct page *page) -{ - struct inode *inode = page->mapping->host; - struct buffer_head bh; - unsigned long size = PAGE_CACHE_SIZE; - u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - - do { - bh.b_state = 0; - bh.b_size = size; - gfs2_block_map(inode, lblock, &bh, 1); - if (!buffer_mapped(&bh)) - return -EIO; - size -= bh.b_size; - lblock += (bh.b_size >> inode->i_blkbits); - } while(size > 0); - return 0; -} - -/** - * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable - * @vma: The virtual memory area - * @page: The page which is about to become writable - * - * When the page becomes writable, we need to ensure that we have - * blocks allocated on disk to back that page. - */ - -static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) -{ - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - unsigned long last_index; - u64 pos = page->index << (PAGE_CACHE_SIZE - inode->i_blkbits); - unsigned int data_blocks, ind_blocks, rblocks; - int alloc_required = 0; - struct gfs2_holder gh; - struct gfs2_alloc *al; - int ret; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &gh); - ret = gfs2_glock_nq_atime(&gh); - if (ret) - goto out; - - set_bit(GIF_SW_PAGED, &ip->i_flags); - gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); - ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); - if (ret || !alloc_required) - goto out_unlock; - ret = -ENOMEM; - al = gfs2_alloc_get(ip); - if (al == NULL) - goto out_unlock; - - ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); - if (ret) - goto out_alloc_put; - ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); - if (ret) - goto out_quota_unlock; - al->al_requested = data_blocks + ind_blocks; - ret = gfs2_inplace_reserve(ip); - if (ret) - goto out_quota_unlock; - - rblocks = RES_DINODE + ind_blocks; - if (gfs2_is_jdata(ip)) - rblocks += data_blocks ? data_blocks : 1; - if (ind_blocks || data_blocks) - rblocks += RES_STATFS + RES_QUOTA; - ret = gfs2_trans_begin(sdp, rblocks, 0); - if (ret) - goto out_trans_fail; - - lock_page(page); - ret = -EINVAL; - last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT; - if (page->index > last_index) - goto out_unlock_page; - ret = 0; - if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping) - goto out_unlock_page; - if (gfs2_is_stuffed(ip)) { - ret = gfs2_unstuff_dinode(ip, page); - if (ret) - goto out_unlock_page; - } - ret = gfs2_allocate_page_backing(page); - -out_unlock_page: - unlock_page(page); - gfs2_trans_end(sdp); -out_trans_fail: - gfs2_inplace_release(ip); -out_quota_unlock: - gfs2_quota_unlock(ip); -out_alloc_put: - gfs2_alloc_put(ip); -out_unlock: - gfs2_glock_dq(&gh); -out: - gfs2_holder_uninit(&gh); - return ret; -} - -static struct vm_operations_struct gfs2_vm_ops = { - .fault = filemap_fault, - .page_mkwrite = gfs2_page_mkwrite, -}; - - -/** - * gfs2_mmap - - * @file: The file to map - * @vma: The VMA which described the mapping - * - * Returns: 0 or error code - */ - -static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - struct gfs2_holder i_gh; - int error; - - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); - error = gfs2_glock_nq_atime(&i_gh); - if (error) { - gfs2_holder_uninit(&i_gh); - return error; - } - - vma->vm_ops = &gfs2_vm_ops; - - gfs2_glock_dq_uninit(&i_gh); - - return error; -} - -/** - * gfs2_open - open a file - * @inode: the inode to open - * @file: the struct file for this opening - * - * Returns: errno - */ - -static int gfs2_open(struct inode *inode, struct file *file) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder i_gh; - struct gfs2_file *fp; - int error; - - fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); - if (!fp) - return -ENOMEM; - - mutex_init(&fp->f_fl_mutex); - - gfs2_assert_warn(GFS2_SB(inode), !file->private_data); - file->private_data = fp; - - if (S_ISREG(ip->i_inode.i_mode)) { - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, - &i_gh); - if (error) - goto fail; - - if (!(file->f_flags & O_LARGEFILE) && - ip->i_di.di_size > MAX_NON_LFS) { - error = -EOVERFLOW; - goto fail_gunlock; - } - - /* Listen to the Direct I/O flag */ - - if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO) - file->f_flags |= O_DIRECT; - - gfs2_glock_dq_uninit(&i_gh); - } - - return 0; - -fail_gunlock: - gfs2_glock_dq_uninit(&i_gh); -fail: - file->private_data = NULL; - kfree(fp); - return error; -} - -/** - * gfs2_close - called to close a struct file - * @inode: the inode the struct file belongs to - * @file: the struct file being closed - * - * Returns: errno - */ - -static int gfs2_close(struct inode *inode, struct file *file) -{ - struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; - struct gfs2_file *fp; - - fp = file->private_data; - file->private_data = NULL; - - if (gfs2_assert_warn(sdp, fp)) - return -EIO; - - kfree(fp); - - return 0; -} - -/** - * gfs2_fsync - sync the dirty data for a file (across the cluster) - * @file: the file that points to the dentry (we ignore this) - * @dentry: the dentry that points to the inode to sync - * - * The VFS will flush "normal" data for us. We only need to worry - * about metadata here. For journaled data, we just do a log flush - * as we can't avoid it. Otherwise we can just bale out if datasync - * is set. For stuffed inodes we must flush the log in order to - * ensure that all data is on disk. - * - * The call to write_inode_now() is there to write back metadata and - * the inode itself. It does also try and write the data, but thats - * (hopefully) a no-op due to the VFS having already called filemap_fdatawrite() - * for us. - * - * Returns: errno - */ - -static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) -{ - struct inode *inode = dentry->d_inode; - int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); - int ret = 0; - - if (gfs2_is_jdata(GFS2_I(inode))) { - gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl); - return 0; - } - - if (sync_state != 0) { - if (!datasync) - ret = write_inode_now(inode, 0); - - if (gfs2_is_stuffed(GFS2_I(inode))) - gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl); - } - - return ret; -} - -/** - * gfs2_setlease - acquire/release a file lease - * @file: the file pointer - * @arg: lease type - * @fl: file lock - * - * Returns: errno - */ - -static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) -{ - struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); - - /* - * We don't currently have a way to enforce a lease across the whole - * cluster; until we do, disable leases (by just returning -EINVAL), - * unless the administrator has requested purely local locking. - */ - if (!sdp->sd_args.ar_localflocks) - return -EINVAL; - return generic_setlease(file, arg, fl); -} - -/** - * gfs2_lock - acquire/release a posix lock on a file - * @file: the file pointer - * @cmd: either modify or retrieve lock state, possibly wait - * @fl: type and range of lock - * - * Returns: errno - */ - -static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) -{ - struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); - struct lm_lockname name = - { .ln_number = ip->i_no_addr, - .ln_type = LM_TYPE_PLOCK }; - - if (!(fl->fl_flags & FL_POSIX)) - return -ENOLCK; - if (__mandatory_lock(&ip->i_inode)) - return -ENOLCK; - - if (cmd == F_CANCELLK) { - /* Hack: */ - cmd = F_SETLK; - fl->fl_type = F_UNLCK; - } - if (IS_GETLK(cmd)) - return gfs2_lm_plock_get(sdp, &name, file, fl); - else if (fl->fl_type == F_UNLCK) - return gfs2_lm_punlock(sdp, &name, file, fl); - else - return gfs2_lm_plock(sdp, &name, file, cmd, fl); -} - -static int do_flock(struct file *file, int cmd, struct file_lock *fl) -{ - struct gfs2_file *fp = file->private_data; - struct gfs2_holder *fl_gh = &fp->f_fl_gh; - struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode); - struct gfs2_glock *gl; - unsigned int state; - int flags; - int error = 0; - - state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; - flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE - | GL_FLOCK; - - mutex_lock(&fp->f_fl_mutex); - - gl = fl_gh->gh_gl; - if (gl) { - if (fl_gh->gh_state == state) - goto out; - flock_lock_file_wait(file, - &(struct file_lock){.fl_type = F_UNLCK}); - gfs2_glock_dq_wait(fl_gh); - gfs2_holder_reinit(state, flags, fl_gh); - } else { - error = gfs2_glock_get(GFS2_SB(&ip->i_inode), - ip->i_no_addr, &gfs2_flock_glops, - CREATE, &gl); - if (error) - goto out; - gfs2_holder_init(gl, state, flags, fl_gh); - gfs2_glock_put(gl); - } - error = gfs2_glock_nq(fl_gh); - if (error) { - gfs2_holder_uninit(fl_gh); - if (error == GLR_TRYFAILED) - error = -EAGAIN; - } else { - error = flock_lock_file_wait(file, fl); - gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); - } - -out: - mutex_unlock(&fp->f_fl_mutex); - return error; -} - -static void do_unflock(struct file *file, struct file_lock *fl) -{ - struct gfs2_file *fp = file->private_data; - struct gfs2_holder *fl_gh = &fp->f_fl_gh; - - mutex_lock(&fp->f_fl_mutex); - flock_lock_file_wait(file, fl); - if (fl_gh->gh_gl) - gfs2_glock_dq_uninit(fl_gh); - mutex_unlock(&fp->f_fl_mutex); -} - -/** - * gfs2_flock - acquire/release a flock lock on a file - * @file: the file pointer - * @cmd: either modify or retrieve lock state, possibly wait - * @fl: type and range of lock - * - * Returns: errno - */ - -static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) -{ - struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - - if (!(fl->fl_flags & FL_FLOCK)) - return -ENOLCK; - if (__mandatory_lock(&ip->i_inode)) - return -ENOLCK; - - if (fl->fl_type == F_UNLCK) { - do_unflock(file, fl); - return 0; - } else { - return do_flock(file, cmd, fl); - } -} - -const struct file_operations gfs2_file_fops = { - .llseek = gfs2_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, - .unlocked_ioctl = gfs2_ioctl, - .mmap = gfs2_mmap, - .open = gfs2_open, - .release = gfs2_close, - .fsync = gfs2_fsync, - .lock = gfs2_lock, - .flock = gfs2_flock, - .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, - .setlease = gfs2_setlease, -}; - -const struct file_operations gfs2_dir_fops = { - .readdir = gfs2_readdir, - .unlocked_ioctl = gfs2_ioctl, - .open = gfs2_open, - .release = gfs2_close, - .fsync = gfs2_fsync, - .lock = gfs2_lock, - .flock = gfs2_flock, -}; - -const struct file_operations gfs2_file_fops_nolock = { - .llseek = gfs2_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, - .unlocked_ioctl = gfs2_ioctl, - .mmap = gfs2_mmap, - .open = gfs2_open, - .release = gfs2_close, - .fsync = gfs2_fsync, - .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, - .setlease = gfs2_setlease, -}; - -const struct file_operations gfs2_dir_fops_nolock = { - .readdir = gfs2_readdir, - .unlocked_ioctl = gfs2_ioctl, - .open = gfs2_open, - .release = gfs2_close, - .fsync = gfs2_fsync, -}; - diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 4bee6aa845e..bc564c0d6d1 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1,12 +1,14 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -14,36 +16,56 @@ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/kthread.h> +#include <linux/export.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> +#include <linux/quotaops.h> +#include <linux/lockdep.h> +#include <linux/module.h> #include "gfs2.h" #include "incore.h" #include "bmap.h" -#include "daemon.h" #include "glock.h" #include "glops.h" #include "inode.h" -#include "lm.h" -#include "mount.h" -#include "ops_fstype.h" -#include "ops_dentry.h" -#include "ops_super.h" #include "recovery.h" #include "rgrp.h" #include "super.h" #include "sys.h" #include "util.h" #include "log.h" +#include "quota.h" +#include "dir.h" +#include "meta_io.h" +#include "trace_gfs2.h" #define DO 0 #define UNDO 1 +/** + * gfs2_tune_init - Fill a gfs2_tune structure with default values + * @gt: tune + * + */ + +static void gfs2_tune_init(struct gfs2_tune *gt) +{ + spin_lock_init(>->gt_spin); + + gt->gt_quota_warn_period = 10; + gt->gt_quota_scale_num = 1; + gt->gt_quota_scale_den = 1; + gt->gt_new_files_jdata = 0; + gt->gt_max_readahead = 1 << 18; + gt->gt_complain_secs = 10; +} + static struct gfs2_sbd *init_sbd(struct super_block *sb) { struct gfs2_sbd *sdp; + struct address_space *mapping; sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); if (!sdp) @@ -51,68 +73,287 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) sb->s_fs_info = sdp; sdp->sd_vfs = sb; + sdp->sd_lkstats = alloc_percpu(struct gfs2_pcpu_lkstats); + if (!sdp->sd_lkstats) { + kfree(sdp); + return NULL; + } + set_bit(SDF_NOJOURNALID, &sdp->sd_flags); gfs2_tune_init(&sdp->sd_tune); - INIT_LIST_HEAD(&sdp->sd_reclaim_list); - spin_lock_init(&sdp->sd_reclaim_lock); - init_waitqueue_head(&sdp->sd_reclaim_wq); - - mutex_init(&sdp->sd_inum_mutex); + init_waitqueue_head(&sdp->sd_glock_wait); + atomic_set(&sdp->sd_glock_disposal, 0); + init_completion(&sdp->sd_locking_init); + init_completion(&sdp->sd_wdack); spin_lock_init(&sdp->sd_statfs_spin); spin_lock_init(&sdp->sd_rindex_spin); - mutex_init(&sdp->sd_rindex_mutex); - INIT_LIST_HEAD(&sdp->sd_rindex_list); - INIT_LIST_HEAD(&sdp->sd_rindex_mru_list); - INIT_LIST_HEAD(&sdp->sd_rindex_recent_list); + sdp->sd_rindex_tree.rb_node = NULL; INIT_LIST_HEAD(&sdp->sd_jindex_list); spin_lock_init(&sdp->sd_jindex_spin); mutex_init(&sdp->sd_jindex_mutex); + init_completion(&sdp->sd_journal_ready); INIT_LIST_HEAD(&sdp->sd_quota_list); - spin_lock_init(&sdp->sd_quota_spin); mutex_init(&sdp->sd_quota_mutex); + mutex_init(&sdp->sd_quota_sync_mutex); + init_waitqueue_head(&sdp->sd_quota_wait); + INIT_LIST_HEAD(&sdp->sd_trunc_list); + spin_lock_init(&sdp->sd_trunc_lock); + spin_lock_init(&sdp->sd_bitmap_lock); + + mapping = &sdp->sd_aspace; + + address_space_init_once(mapping); + mapping->a_ops = &gfs2_rgrp_aops; + mapping->host = sb->s_bdev->bd_inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->private_data = NULL; + mapping->backing_dev_info = sb->s_bdi; + mapping->writeback_index = 0; spin_lock_init(&sdp->sd_log_lock); - - INIT_LIST_HEAD(&sdp->sd_log_le_buf); + atomic_set(&sdp->sd_log_pinned, 0); INIT_LIST_HEAD(&sdp->sd_log_le_revoke); - INIT_LIST_HEAD(&sdp->sd_log_le_rg); - INIT_LIST_HEAD(&sdp->sd_log_le_databuf); INIT_LIST_HEAD(&sdp->sd_log_le_ordered); + spin_lock_init(&sdp->sd_ordered_lock); - mutex_init(&sdp->sd_log_reserve_mutex); + init_waitqueue_head(&sdp->sd_log_waitq); + init_waitqueue_head(&sdp->sd_logd_waitq); + spin_lock_init(&sdp->sd_ail_lock); INIT_LIST_HEAD(&sdp->sd_ail1_list); INIT_LIST_HEAD(&sdp->sd_ail2_list); init_rwsem(&sdp->sd_log_flush_lock); atomic_set(&sdp->sd_log_in_flight, 0); init_waitqueue_head(&sdp->sd_log_flush_wait); + init_waitqueue_head(&sdp->sd_log_frozen_wait); + atomic_set(&sdp->sd_log_freeze, 0); + atomic_set(&sdp->sd_frozen_root, 0); + init_waitqueue_head(&sdp->sd_frozen_root_wait); - INIT_LIST_HEAD(&sdp->sd_revoke_list); + return sdp; +} - mutex_init(&sdp->sd_freeze_lock); - return sdp; +/** + * gfs2_check_sb - Check superblock + * @sdp: the filesystem + * @sb: The superblock + * @silent: Don't print a message if the check fails + * + * Checks the version code of the FS is one that we understand how to + * read and that the sizes of the various on-disk structures have not + * changed. + */ + +static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) +{ + struct gfs2_sb_host *sb = &sdp->sd_sb; + + if (sb->sb_magic != GFS2_MAGIC || + sb->sb_type != GFS2_METATYPE_SB) { + if (!silent) + pr_warn("not a GFS2 filesystem\n"); + return -EINVAL; + } + + /* If format numbers match exactly, we're done. */ + + if (sb->sb_fs_format == GFS2_FORMAT_FS && + sb->sb_multihost_format == GFS2_FORMAT_MULTI) + return 0; + + fs_warn(sdp, "Unknown on-disk format, unable to mount\n"); + + return -EINVAL; } -static void init_vfs(struct super_block *sb, unsigned noatime) +static void end_bio_io_page(struct bio *bio, int error) { - struct gfs2_sbd *sdp = sb->s_fs_info; + struct page *page = bio->bi_private; - sb->s_magic = GFS2_MAGIC; - sb->s_op = &gfs2_super_ops; - sb->s_export_op = &gfs2_export_ops; - sb->s_time_gran = 1; - sb->s_maxbytes = MAX_LFS_FILESIZE; + if (!error) + SetPageUptodate(page); + else + pr_warn("error %d reading superblock\n", error); + unlock_page(page); +} + +static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf) +{ + struct gfs2_sb_host *sb = &sdp->sd_sb; + struct super_block *s = sdp->sd_vfs; + const struct gfs2_sb *str = buf; + + sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic); + sb->sb_type = be32_to_cpu(str->sb_header.mh_type); + sb->sb_format = be32_to_cpu(str->sb_header.mh_format); + sb->sb_fs_format = be32_to_cpu(str->sb_fs_format); + sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format); + sb->sb_bsize = be32_to_cpu(str->sb_bsize); + sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift); + sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr); + sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino); + sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr); + sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino); + + memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); + memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); + memcpy(s->s_uuid, str->sb_uuid, 16); +} - if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME)) - set_bit(noatime, &sdp->sd_flags); +/** + * gfs2_read_super - Read the gfs2 super block from disk + * @sdp: The GFS2 super block + * @sector: The location of the super block + * @error: The error code to return + * + * This uses the bio functions to read the super block from disk + * because we want to be 100% sure that we never read cached data. + * A super block is read twice only during each GFS2 mount and is + * never written to by the filesystem. The first time its read no + * locks are held, and the only details which are looked at are those + * relating to the locking protocol. Once locking is up and working, + * the sb is read again under the lock to establish the location of + * the master directory (contains pointers to journals etc) and the + * root directory. + * + * Returns: 0 on success or error + */ - /* Don't let the VFS update atimes. GFS2 handles this itself. */ - sb->s_flags |= MS_NOATIME | MS_NODIRATIME; +static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) +{ + struct super_block *sb = sdp->sd_vfs; + struct gfs2_sb *p; + struct page *page; + struct bio *bio; + + page = alloc_page(GFP_NOFS); + if (unlikely(!page)) + return -ENOMEM; + + ClearPageUptodate(page); + ClearPageDirty(page); + lock_page(page); + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); + bio->bi_bdev = sb->s_bdev; + bio_add_page(bio, page, PAGE_SIZE, 0); + + bio->bi_end_io = end_bio_io_page; + bio->bi_private = page; + submit_bio(READ_SYNC | REQ_META, bio); + wait_on_page_locked(page); + bio_put(bio); + if (!PageUptodate(page)) { + __free_page(page); + return -EIO; + } + p = kmap(page); + gfs2_sb_in(sdp, p); + kunmap(page); + __free_page(page); + return gfs2_check_sb(sdp, silent); +} + +/** + * gfs2_read_sb - Read super block + * @sdp: The GFS2 superblock + * @silent: Don't print message if mount fails + * + */ + +static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) +{ + u32 hash_blocks, ind_blocks, leaf_blocks; + u32 tmp_blocks; + unsigned int x; + int error; + + error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent); + if (error) { + if (!silent) + fs_err(sdp, "can't read superblock\n"); + return error; + } + + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - + GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode)) / sizeof(u64); + sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / sizeof(u64); + sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); + sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2; + sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1; + sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64); + sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / + sizeof(struct gfs2_quota_change); + sdp->sd_blocks_per_bitmap = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) + * GFS2_NBBY; /* not the rgrp bitmap, subsequent bitmaps only */ + + /* Compute maximum reservation required to add a entry to a directory */ + + hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH), + sdp->sd_jbsize); + + ind_blocks = 0; + for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) { + tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs); + ind_blocks += tmp_blocks; + } + + leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH; + + sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks; + + sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode); + sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs; + for (x = 2;; x++) { + u64 space, d; + u32 m; + + space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs; + d = space; + m = do_div(d, sdp->sd_inptrs); + + if (d != sdp->sd_heightsize[x - 1] || m) + break; + sdp->sd_heightsize[x] = space; + } + sdp->sd_max_height = x; + sdp->sd_heightsize[x] = ~0; + gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT); + + sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode); + sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs; + for (x = 2;; x++) { + u64 space, d; + u32 m; + + space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs; + d = space; + m = do_div(d, sdp->sd_inptrs); + + if (d != sdp->sd_jheightsize[x - 1] || m) + break; + sdp->sd_jheightsize[x] = space; + } + sdp->sd_max_jheight = x; + sdp->sd_jheightsize[x] = ~0; + gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); + + return 0; } static int init_names(struct gfs2_sbd *sdp, int silent) @@ -126,14 +367,10 @@ static int init_names(struct gfs2_sbd *sdp, int silent) /* Try to autodetect */ if (!proto[0] || !table[0]) { - error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); + error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent); if (error) return error; - error = gfs2_check_sb(sdp, &sdp->sd_sb, silent); - if (error) - goto out; - if (!proto[0]) proto = sdp->sd_sb.sb_lockproto; if (!table[0]) @@ -143,38 +380,24 @@ static int init_names(struct gfs2_sbd *sdp, int silent) if (!table[0]) table = sdp->sd_vfs->s_id; - snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto); - snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table); + strlcpy(sdp->sd_proto_name, proto, GFS2_FSNAME_LEN); + strlcpy(sdp->sd_table_name, table, GFS2_FSNAME_LEN); table = sdp->sd_table_name; while ((table = strchr(table, '/'))) *table = '_'; -out: return error; } static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, int undo) { - struct task_struct *p; int error = 0; if (undo) goto fail_trans; - for (sdp->sd_glockd_num = 0; - sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd; - sdp->sd_glockd_num++) { - p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd"); - error = IS_ERR(p); - if (error) { - fs_err(sdp, "can't start glockd thread: %d\n", error); - goto fail; - } - sdp->sd_glockd_process[sdp->sd_glockd_num] = p; - } - error = gfs2_glock_nq_num(sdp, GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, @@ -201,18 +424,17 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, goto fail_live; } - error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops, - CREATE, &sdp->sd_trans_gl); + error = gfs2_glock_get(sdp, GFS2_FREEZE_LOCK, &gfs2_freeze_glops, + CREATE, &sdp->sd_freeze_gl); if (error) { fs_err(sdp, "can't create transaction glock: %d\n", error); goto fail_rename; } - set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags); return 0; fail_trans: - gfs2_glock_put(sdp->sd_trans_gl); + gfs2_glock_put(sdp->sd_freeze_gl); fail_rename: gfs2_glock_put(sdp->sd_rename_gl); fail_live: @@ -220,57 +442,60 @@ fail_live: fail_mount: gfs2_glock_dq_uninit(mount_gh); fail: - while (sdp->sd_glockd_num--) - kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]); - return error; } -static inline struct inode *gfs2_lookup_root(struct super_block *sb, - u64 no_addr) +static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr, + u64 no_addr, const char *name) { - return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); + struct gfs2_sbd *sdp = sb->s_fs_info; + struct dentry *dentry; + struct inode *inode; + + inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); + if (IS_ERR(inode)) { + fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); + return PTR_ERR(inode); + } + dentry = d_make_root(inode); + if (!dentry) { + fs_err(sdp, "can't alloc %s dentry\n", name); + return -ENOMEM; + } + *dptr = dentry; + return 0; } -static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) +static int init_sb(struct gfs2_sbd *sdp, int silent) { struct super_block *sb = sdp->sd_vfs; struct gfs2_holder sb_gh; u64 no_addr; - struct inode *inode; - int error = 0; + int ret; - if (undo) { - if (sb->s_root) { - dput(sb->s_root); - sb->s_root = NULL; - } - return 0; - } - - error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops, - LM_ST_SHARED, 0, &sb_gh); - if (error) { - fs_err(sdp, "can't acquire superblock glock: %d\n", error); - return error; + ret = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops, + LM_ST_SHARED, 0, &sb_gh); + if (ret) { + fs_err(sdp, "can't acquire superblock glock: %d\n", ret); + return ret; } - error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent); - if (error) { - fs_err(sdp, "can't read superblock: %d\n", error); + ret = gfs2_read_sb(sdp, silent); + if (ret) { + fs_err(sdp, "can't read superblock: %d\n", ret); goto out; } /* Set up the buffer cache and SB for real */ - if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { - error = -EINVAL; + if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) { + ret = -EINVAL; fs_err(sdp, "FS block size (%u) is too small for device " "block size (%u)\n", - sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev)); + sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev)); goto out; } if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { - error = -EINVAL; + ret = -EINVAL; fs_err(sdp, "FS block size (%u) is too big for machine " "page size (%u)\n", sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE); @@ -280,109 +505,165 @@ static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) /* Get the root inode */ no_addr = sdp->sd_sb.sb_root_dir.no_addr; - if (sb->s_type == &gfs2meta_fs_type) - no_addr = sdp->sd_sb.sb_master_dir.no_addr; - inode = gfs2_lookup_root(sb, no_addr); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); - fs_err(sdp, "can't read in root inode: %d\n", error); + ret = gfs2_lookup_root(sb, &sdp->sd_root_dir, no_addr, "root"); + if (ret) goto out; - } - sb->s_root = d_alloc_root(inode); - if (!sb->s_root) { - fs_err(sdp, "can't get root dentry\n"); - error = -ENOMEM; - iput(inode); - } else - sb->s_root->d_op = &gfs2_dops; - + /* Get the master inode */ + no_addr = sdp->sd_sb.sb_master_dir.no_addr; + ret = gfs2_lookup_root(sb, &sdp->sd_master_dir, no_addr, "master"); + if (ret) { + dput(sdp->sd_root_dir); + goto out; + } + sb->s_root = dget(sdp->sd_args.ar_meta ? sdp->sd_master_dir : sdp->sd_root_dir); out: gfs2_glock_dq_uninit(&sb_gh); - return error; + return ret; +} + +static void gfs2_others_may_mount(struct gfs2_sbd *sdp) +{ + char *message = "FIRSTMOUNT=Done"; + char *envp[] = { message, NULL }; + + fs_info(sdp, "first mount done, others may mount\n"); + + if (sdp->sd_lockstruct.ls_ops->lm_first_done) + sdp->sd_lockstruct.ls_ops->lm_first_done(sdp); + + kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); } /** - * map_journal_extents - create a reusable "extent" mapping from all logical - * blocks to all physical blocks for the given journal. This will save - * us time when writing journal blocks. Most journals will have only one - * extent that maps all their logical blocks. That's because gfs2.mkfs - * arranges the journal blocks sequentially to maximize performance. - * So the extent would map the first block for the entire file length. - * However, gfs2_jadd can happen while file activity is happening, so - * those journals may not be sequential. Less likely is the case where - * the users created their own journals by mounting the metafs and - * laying it out. But it's still possible. These journals might have - * several extents. + * gfs2_jindex_hold - Grab a lock on the jindex + * @sdp: The GFS2 superblock + * @ji_gh: the holder for the jindex glock * - * TODO: This should be done in bigger chunks rather than one block at a time, - * but since it's only done at mount time, I'm not worried about the - * time it takes. + * Returns: errno */ -static int map_journal_extents(struct gfs2_sbd *sdp) + +static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) { - struct gfs2_jdesc *jd = sdp->sd_jdesc; - unsigned int lb; - u64 db, prev_db; /* logical block, disk block, prev disk block */ - struct gfs2_inode *ip = GFS2_I(jd->jd_inode); - struct gfs2_journal_extent *jext = NULL; - struct buffer_head bh; - int rc = 0; - - prev_db = 0; - - for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) { - bh.b_state = 0; - bh.b_blocknr = 0; - bh.b_size = 1 << ip->i_inode.i_blkbits; - rc = gfs2_block_map(jd->jd_inode, lb, &bh, 0); - db = bh.b_blocknr; - if (rc || !db) { - printk(KERN_INFO "GFS2 journal mapping error %d: lb=" - "%u db=%llu\n", rc, lb, (unsigned long long)db); + struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex); + struct qstr name; + char buf[20]; + struct gfs2_jdesc *jd; + int error; + + name.name = buf; + + mutex_lock(&sdp->sd_jindex_mutex); + + for (;;) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh); + if (error) + break; + + name.len = sprintf(buf, "journal%u", sdp->sd_journals); + name.hash = gfs2_disk_hash(name.name, name.len); + + error = gfs2_dir_check(sdp->sd_jindex, &name, NULL); + if (error == -ENOENT) { + error = 0; break; } - if (!prev_db || db != prev_db + 1) { - jext = kzalloc(sizeof(struct gfs2_journal_extent), - GFP_KERNEL); - if (!jext) { - printk(KERN_INFO "GFS2 error: out of memory " - "mapping journal extents.\n"); - rc = -ENOMEM; - break; - } - jext->dblock = db; - jext->lblock = lb; - jext->blocks = 1; - list_add_tail(&jext->extent_list, &jd->extent_list); - } else { - jext->blocks++; + + gfs2_glock_dq_uninit(ji_gh); + + if (error) + break; + + error = -ENOMEM; + jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL); + if (!jd) + break; + + INIT_LIST_HEAD(&jd->extent_list); + INIT_LIST_HEAD(&jd->jd_revoke_list); + + INIT_WORK(&jd->jd_work, gfs2_recover_func); + jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); + if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { + if (!jd->jd_inode) + error = -ENOENT; + else + error = PTR_ERR(jd->jd_inode); + kfree(jd); + break; } - prev_db = db; + + spin_lock(&sdp->sd_jindex_spin); + jd->jd_jid = sdp->sd_journals++; + list_add_tail(&jd->jd_list, &sdp->sd_jindex_list); + spin_unlock(&sdp->sd_jindex_spin); + } + + mutex_unlock(&sdp->sd_jindex_mutex); + + return error; +} + +/** + * check_journal_clean - Make sure a journal is clean for a spectator mount + * @sdp: The GFS2 superblock + * @jd: The journal descriptor + * + * Returns: 0 if the journal is clean or locked, else an error + */ +static int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) +{ + int error; + struct gfs2_holder j_gh; + struct gfs2_log_header_host head; + struct gfs2_inode *ip; + + ip = GFS2_I(jd->jd_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP | + GL_EXACT | GL_NOCACHE, &j_gh); + if (error) { + fs_err(sdp, "Error locking journal for spectator mount.\n"); + return -EPERM; + } + error = gfs2_jdesc_check(jd); + if (error) { + fs_err(sdp, "Error checking journal for spectator mount.\n"); + goto out_unlock; + } + error = gfs2_find_jhead(jd, &head); + if (error) { + fs_err(sdp, "Error parsing journal for spectator mount.\n"); + goto out_unlock; + } + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + error = -EPERM; + fs_err(sdp, "jid=%u: Journal is dirty, so the first mounter " + "must not be a spectator.\n", jd->jd_jid); } - return rc; + +out_unlock: + gfs2_glock_dq_uninit(&j_gh); + return error; } static int init_journal(struct gfs2_sbd *sdp, int undo) { + struct inode *master = sdp->sd_master_dir->d_inode; struct gfs2_holder ji_gh; - struct task_struct *p; struct gfs2_inode *ip; int jindex = 1; int error = 0; if (undo) { jindex = 0; - goto fail_recoverd; + goto fail_jinode_gh; } - sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex"); + sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); if (IS_ERR(sdp->sd_jindex)) { fs_err(sdp, "can't lookup journal index: %d\n", error); return PTR_ERR(sdp->sd_jindex); } - ip = GFS2_I(sdp->sd_jindex); - set_bit(GLF_STICKY, &ip->i_gl->gl_flags); /* Load in the journal index special file */ @@ -392,7 +673,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) goto fail; } - error = -EINVAL; + error = -EUSERS; if (!gfs2_jindex_size(sdp)) { fs_err(sdp, "no journals!\n"); goto fail_jindex; @@ -401,6 +682,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) if (sdp->sd_args.ar_spectator) { sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5); + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); } else { if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) { fs_err(sdp, "can't mount journal #%u\n", @@ -438,15 +721,26 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) goto fail_jinode_gh; } atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5); + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); /* Map the extents for this journal's blocks */ - map_journal_extents(sdp); + gfs2_map_journal_extents(sdp, sdp->sd_jdesc); } + trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free)); if (sdp->sd_lockstruct.ls_first) { unsigned int x; for (x = 0; x < sdp->sd_journals; x++) { - error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x)); + struct gfs2_jdesc *jd = gfs2_jdesc_find(sdp, x); + + if (sdp->sd_args.ar_spectator) { + error = check_journal_clean(sdp, jd); + if (error) + goto fail_jinode_gh; + continue; + } + error = gfs2_recover_journal(jd, true); if (error) { fs_err(sdp, "error recovering journal %u: %d\n", x, error); @@ -454,9 +748,9 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) } } - gfs2_lm_others_may_mount(sdp); + gfs2_others_may_mount(sdp); } else if (!sdp->sd_args.ar_spectator) { - error = gfs2_recover_journal(sdp->sd_jdesc); + error = gfs2_recover_journal(sdp->sd_jdesc, true); if (error) { fs_err(sdp, "error recovering my journal: %d\n", error); goto fail_jinode_gh; @@ -466,19 +760,17 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags); gfs2_glock_dq_uninit(&ji_gh); jindex = 0; - - p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd"); - error = IS_ERR(p); - if (error) { - fs_err(sdp, "can't start recoverd thread: %d\n", error); - goto fail_jinode_gh; + if (!sdp->sd_args.ar_spectator) { + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0, + &sdp->sd_thaw_gh); + if (error) { + fs_err(sdp, "can't acquire freeze glock: %d\n", error); + goto fail_jinode_gh; + } } - sdp->sd_recoverd_process = p; - + gfs2_glock_dq_uninit(&sdp->sd_thaw_gh); return 0; -fail_recoverd: - kthread_stop(sdp->sd_recoverd_process); fail_jinode_gh: if (!sdp->sd_args.ar_spectator) gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); @@ -494,63 +786,56 @@ fail: return error; } +static struct lock_class_key gfs2_quota_imutex_key; static int init_inodes(struct gfs2_sbd *sdp, int undo) { int error = 0; - struct gfs2_inode *ip; - struct inode *inode; + struct inode *master = sdp->sd_master_dir->d_inode; if (undo) goto fail_qinode; - inode = gfs2_lookup_root(sdp->sd_vfs, sdp->sd_sb.sb_master_dir.no_addr); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); - fs_err(sdp, "can't read in master directory: %d\n", error); - goto fail; - } - sdp->sd_master_dir = inode; - error = init_journal(sdp, undo); + complete_all(&sdp->sd_journal_ready); if (error) - goto fail_master; - - /* Read in the master inode number inode */ - sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum"); - if (IS_ERR(sdp->sd_inum_inode)) { - error = PTR_ERR(sdp->sd_inum_inode); - fs_err(sdp, "can't read in inum inode: %d\n", error); - goto fail_journal; - } - + goto fail; /* Read in the master statfs inode */ - sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs"); + sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); if (IS_ERR(sdp->sd_statfs_inode)) { error = PTR_ERR(sdp->sd_statfs_inode); fs_err(sdp, "can't read in statfs inode: %d\n", error); - goto fail_inum; + goto fail_journal; } /* Read in the resource index inode */ - sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex"); + sdp->sd_rindex = gfs2_lookup_simple(master, "rindex"); if (IS_ERR(sdp->sd_rindex)) { error = PTR_ERR(sdp->sd_rindex); fs_err(sdp, "can't get resource index inode: %d\n", error); goto fail_statfs; } - ip = GFS2_I(sdp->sd_rindex); - set_bit(GLF_STICKY, &ip->i_gl->gl_flags); - sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1; + sdp->sd_rindex_uptodate = 0; /* Read in the quota inode */ - sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota"); + sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota"); if (IS_ERR(sdp->sd_quota_inode)) { error = PTR_ERR(sdp->sd_quota_inode); fs_err(sdp, "can't get quota file inode: %d\n", error); goto fail_rindex; } + /* + * i_mutex on quota files is special. Since this inode is hidden system + * file, we are safe to define locking ourselves. + */ + lockdep_set_class(&sdp->sd_quota_inode->i_mutex, + &gfs2_quota_imutex_key); + + error = gfs2_rindex_update(sdp); + if (error) + goto fail_qinode; + return 0; fail_qinode: @@ -560,12 +845,8 @@ fail_rindex: iput(sdp->sd_rindex); fail_statfs: iput(sdp->sd_statfs_inode); -fail_inum: - iput(sdp->sd_inum_inode); fail_journal: init_journal(sdp, UNDO); -fail_master: - iput(sdp->sd_master_dir); fail: return error; } @@ -576,6 +857,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) char buf[30]; int error = 0; struct gfs2_inode *ip; + struct inode *master = sdp->sd_master_dir->d_inode; if (sdp->sd_args.ar_spectator) return 0; @@ -583,27 +865,19 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) if (undo) goto fail_qc_gh; - pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node"); + pn = gfs2_lookup_simple(master, "per_node"); if (IS_ERR(pn)) { error = PTR_ERR(pn); fs_err(sdp, "can't find per_node directory: %d\n", error); return error; } - sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid); - sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf); - if (IS_ERR(sdp->sd_ir_inode)) { - error = PTR_ERR(sdp->sd_ir_inode); - fs_err(sdp, "can't find local \"ir\" file: %d\n", error); - goto fail; - } - sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); if (IS_ERR(sdp->sd_sc_inode)) { error = PTR_ERR(sdp->sd_sc_inode); fs_err(sdp, "can't find local \"sc\" file: %d\n", error); - goto fail_ir_i; + goto fail; } sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); @@ -617,27 +891,16 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) iput(pn); pn = NULL; - ip = GFS2_I(sdp->sd_ir_inode); - error = gfs2_glock_nq_init(ip->i_gl, - LM_ST_EXCLUSIVE, 0, - &sdp->sd_ir_gh); - if (error) { - fs_err(sdp, "can't lock local \"ir\" file: %d\n", error); - goto fail_qc_i; - } - ip = GFS2_I(sdp->sd_sc_inode); - error = gfs2_glock_nq_init(ip->i_gl, - LM_ST_EXCLUSIVE, 0, + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &sdp->sd_sc_gh); if (error) { fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); - goto fail_ir_gh; + goto fail_qc_i; } ip = GFS2_I(sdp->sd_qc_inode); - error = gfs2_glock_nq_init(ip->i_gl, - LM_ST_EXCLUSIVE, 0, + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &sdp->sd_qc_gh); if (error) { fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); @@ -650,58 +913,142 @@ fail_qc_gh: gfs2_glock_dq_uninit(&sdp->sd_qc_gh); fail_ut_gh: gfs2_glock_dq_uninit(&sdp->sd_sc_gh); -fail_ir_gh: - gfs2_glock_dq_uninit(&sdp->sd_ir_gh); fail_qc_i: iput(sdp->sd_qc_inode); fail_ut_i: iput(sdp->sd_sc_inode); -fail_ir_i: - iput(sdp->sd_ir_inode); fail: if (pn) iput(pn); return error; } -static int init_threads(struct gfs2_sbd *sdp, int undo) -{ - struct task_struct *p; - int error = 0; +static const match_table_t nolock_tokens = { + { Opt_jid, "jid=%d\n", }, + { Opt_err, NULL }, +}; - if (undo) - goto fail_quotad; +static const struct lm_lockops nolock_ops = { + .lm_proto_name = "lock_nolock", + .lm_put_lock = gfs2_glock_free, + .lm_tokens = &nolock_tokens, +}; - sdp->sd_log_flush_time = jiffies; - sdp->sd_jindex_refresh_time = jiffies; +/** + * gfs2_lm_mount - mount a locking protocol + * @sdp: the filesystem + * @args: mount arguments + * @silent: if 1, don't complain if the FS isn't a GFS2 fs + * + * Returns: errno + */ - p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); - error = IS_ERR(p); - if (error) { - fs_err(sdp, "can't start logd thread: %d\n", error); - return error; +static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) +{ + const struct lm_lockops *lm; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + struct gfs2_args *args = &sdp->sd_args; + const char *proto = sdp->sd_proto_name; + const char *table = sdp->sd_table_name; + char *o, *options; + int ret; + + if (!strcmp("lock_nolock", proto)) { + lm = &nolock_ops; + sdp->sd_args.ar_localflocks = 1; +#ifdef CONFIG_GFS2_FS_LOCKING_DLM + } else if (!strcmp("lock_dlm", proto)) { + lm = &gfs2_dlm_ops; +#endif + } else { + pr_info("can't find protocol %s\n", proto); + return -ENOENT; } - sdp->sd_logd_process = p; - sdp->sd_statfs_sync_time = jiffies; - sdp->sd_quota_sync_time = jiffies; + fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table); - p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); - error = IS_ERR(p); - if (error) { - fs_err(sdp, "can't start quotad thread: %d\n", error); - goto fail; + ls->ls_ops = lm; + ls->ls_first = 1; + + for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) { + substring_t tmp[MAX_OPT_ARGS]; + int token, option; + + if (!o || !*o) + continue; + + token = match_token(o, *lm->lm_tokens, tmp); + switch (token) { + case Opt_jid: + ret = match_int(&tmp[0], &option); + if (ret || option < 0) + goto hostdata_error; + if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags)) + ls->ls_jid = option; + break; + case Opt_id: + case Opt_nodir: + /* Obsolete, but left for backward compat purposes */ + break; + case Opt_first: + ret = match_int(&tmp[0], &option); + if (ret || (option != 0 && option != 1)) + goto hostdata_error; + ls->ls_first = option; + break; + case Opt_err: + default: +hostdata_error: + fs_info(sdp, "unknown hostdata (%s)\n", o); + return -EINVAL; + } + } + + if (lm->lm_mount == NULL) { + fs_info(sdp, "Now mounting FS...\n"); + complete_all(&sdp->sd_locking_init); + return 0; } - sdp->sd_quotad_process = p; + ret = lm->lm_mount(sdp, table); + if (ret == 0) + fs_info(sdp, "Joined cluster. Now mounting FS...\n"); + complete_all(&sdp->sd_locking_init); + return ret; +} +void gfs2_lm_unmount(struct gfs2_sbd *sdp) +{ + const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) && + lm->lm_unmount) + lm->lm_unmount(sdp); +} + +static int gfs2_journalid_wait(void *word) +{ + if (signal_pending(current)) + return -EINTR; + schedule(); return 0; +} +static int wait_on_journal(struct gfs2_sbd *sdp) +{ + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + return 0; -fail_quotad: - kthread_stop(sdp->sd_quotad_process); -fail: - kthread_stop(sdp->sd_logd_process); - return error; + return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, gfs2_journalid_wait, TASK_INTERRUPTIBLE); +} + +void gfs2_online_uevent(struct gfs2_sbd *sdp) +{ + struct super_block *sb = sdp->sd_vfs; + char ro[20]; + char spectator[20]; + char *envp[] = { ro, spectator, NULL }; + sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); + sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp); } /** @@ -713,7 +1060,7 @@ fail: * Returns: errno */ -static int fill_super(struct super_block *sb, void *data, int silent) +static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent) { struct gfs2_sbd *sdp; struct gfs2_holder mount_gh; @@ -721,17 +1068,30 @@ static int fill_super(struct super_block *sb, void *data, int silent) sdp = init_sbd(sb); if (!sdp) { - printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n"); + pr_warn("can't alloc struct gfs2_sbd\n"); return -ENOMEM; } + sdp->sd_args = *args; - error = gfs2_mount_args(sdp, (char *)data, 0); - if (error) { - printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); - goto fail; + if (sdp->sd_args.ar_spectator) { + sb->s_flags |= MS_RDONLY; + set_bit(SDF_RORECOVERY, &sdp->sd_flags); } + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= MS_POSIXACL; + if (sdp->sd_args.ar_nobarrier) + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); - init_vfs(sb, SDF_NOATIME); + sb->s_flags |= MS_NOSEC; + sb->s_magic = GFS2_MAGIC; + sb->s_op = &gfs2_super_ops; + sb->s_d_op = &gfs2_dops; + sb->s_export_op = &gfs2_export_ops; + sb->s_xattr = gfs2_xattr_handlers; + sb->s_qcop = &gfs2_quotactl_ops; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + sb->s_time_gran = 1; + sb->s_maxbytes = MAX_LFS_FILESIZE; /* Set up the buffer cache and fill in some fake block size values to allow us to read-in the on-disk superblock. */ @@ -741,28 +1101,78 @@ static int fill_super(struct super_block *sb, void *data, int silent) GFS2_BASIC_BLOCK_SHIFT; sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit; + sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum; + if (sdp->sd_args.ar_statfs_quantum) { + sdp->sd_tune.gt_statfs_slow = 0; + sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum; + } else { + sdp->sd_tune.gt_statfs_slow = 1; + sdp->sd_tune.gt_statfs_quantum = 30; + } + error = init_names(sdp, silent); - if (error) - goto fail; + if (error) { + /* In this case, we haven't initialized sysfs, so we have to + manually free the sdp. */ + free_percpu(sdp->sd_lkstats); + kfree(sdp); + sb->s_fs_info = NULL; + return error; + } - gfs2_create_debugfs_file(sdp); + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name); error = gfs2_sys_fs_add(sdp); + /* + * If we hit an error here, gfs2_sys_fs_add will have called function + * kobject_put which causes the sysfs usage count to go to zero, which + * causes sysfs to call function gfs2_sbd_release, which frees sdp. + * Subsequent error paths here will call gfs2_sys_fs_del, which also + * kobject_put to free sdp. + */ if (error) - goto fail; + return error; + + gfs2_create_debugfs_file(sdp); error = gfs2_lm_mount(sdp, silent); if (error) - goto fail_sys; + goto fail_debug; error = init_locking(sdp, &mount_gh, DO); if (error) goto fail_lm; - error = init_sb(sdp, silent, DO); + error = init_sb(sdp, silent); if (error) goto fail_locking; + error = wait_on_journal(sdp); + if (error) + goto fail_sb; + + /* + * If user space has failed to join the cluster or some similar + * failure has occurred, then the journal id will contain a + * negative (error) number. This will then be returned to the + * caller (of the mount syscall). We do this even for spectator + * mounts (which just write a jid of 0 to indicate "ok" even though + * the jid is unused in the spectator case) + */ + if (sdp->sd_lockstruct.ls_jid < 0) { + error = sdp->sd_lockstruct.ls_jid; + sdp->sd_lockstruct.ls_jid = 0; + goto fail_sb; + } + + if (sdp->sd_args.ar_spectator) + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", + sdp->sd_table_name); + else + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", + sdp->sd_table_name, sdp->sd_lockstruct.ls_jid); + error = init_inodes(sdp, DO); if (error) goto fail_sb; @@ -777,210 +1187,238 @@ static int fill_super(struct super_block *sb, void *data, int silent) goto fail_per_node; } - error = init_threads(sdp, DO); - if (error) - goto fail_per_node; - if (!(sb->s_flags & MS_RDONLY)) { error = gfs2_make_fs_rw(sdp); if (error) { fs_err(sdp, "can't make FS RW: %d\n", error); - goto fail_threads; + goto fail_per_node; } } gfs2_glock_dq_uninit(&mount_gh); - + gfs2_online_uevent(sdp); return 0; -fail_threads: - init_threads(sdp, UNDO); fail_per_node: init_per_node(sdp, UNDO); fail_inodes: init_inodes(sdp, UNDO); fail_sb: - init_sb(sdp, 0, UNDO); + if (sdp->sd_root_dir) + dput(sdp->sd_root_dir); + if (sdp->sd_master_dir) + dput(sdp->sd_master_dir); + if (sb->s_root) + dput(sb->s_root); + sb->s_root = NULL; fail_locking: init_locking(sdp, &mount_gh, UNDO); fail_lm: - gfs2_gl_hash_clear(sdp, WAIT); + complete_all(&sdp->sd_journal_ready); + gfs2_gl_hash_clear(sdp); gfs2_lm_unmount(sdp); - while (invalidate_inodes(sb)) - yield(); -fail_sys: - gfs2_sys_fs_del(sdp); -fail: +fail_debug: gfs2_delete_debugfs_file(sdp); - kfree(sdp); + free_percpu(sdp->sd_lkstats); + /* gfs2_sys_fs_del must be the last thing we do, since it causes + * sysfs to call function gfs2_sbd_release, which frees sdp. */ + gfs2_sys_fs_del(sdp); sb->s_fs_info = NULL; return error; } -static int gfs2_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) -{ - struct super_block *sb; - struct gfs2_sbd *sdp; - int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); - if (error) - goto out; - sb = mnt->mnt_sb; - sdp = sb->s_fs_info; - sdp->sd_gfs2mnt = mnt; -out: - return error; -} - -static int fill_super_meta(struct super_block *sb, struct super_block *new, - void *data, int silent) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - struct inode *inode; - int error = 0; - - new->s_fs_info = sdp; - sdp->sd_vfs_meta = sb; - - init_vfs(new, SDF_NOATIME); - - /* Get the master inode */ - inode = igrab(sdp->sd_master_dir); - - new->s_root = d_alloc_root(inode); - if (!new->s_root) { - fs_err(sdp, "can't get root dentry\n"); - error = -ENOMEM; - iput(inode); - } else - new->s_root->d_op = &gfs2_dops; - - return error; -} - -static int set_bdev_super(struct super_block *s, void *data) +static int set_gfs2_super(struct super_block *s, void *data) { s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; + + /* + * We set the bdi here to the queue backing, file systems can + * overwrite this in ->fill_super() + */ + s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info; return 0; } -static int test_bdev_super(struct super_block *s, void *data) +static int test_gfs2_super(struct super_block *s, void *ptr) { - return s->s_bdev == data; + struct block_device *bdev = ptr; + return (bdev == s->s_bdev); } -static struct super_block* get_gfs2_sb(const char *dev_name) +/** + * gfs2_mount - Get the GFS2 superblock + * @fs_type: The GFS2 filesystem type + * @flags: Mount flags + * @dev_name: The name of the device + * @data: The mount arguments + * + * Q. Why not use get_sb_bdev() ? + * A. We need to select one of two root directories to mount, independent + * of whether this is the initial, or subsequent, mount of this sb + * + * Returns: 0 or -ve on error + */ + +static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { - struct kstat stat; - struct nameidata nd; - struct file_system_type *fstype; - struct super_block *sb = NULL, *s; + struct block_device *bdev; + struct super_block *s; + fmode_t mode = FMODE_READ | FMODE_EXCL; int error; + struct gfs2_args args; + struct gfs2_sbd *sdp; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + bdev = blkdev_get_by_path(dev_name, mode, fs_type); + if (IS_ERR(bdev)) + return ERR_CAST(bdev); + + /* + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting + */ + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + error = -EBUSY; + goto error_bdev; + } + s = sget(fs_type, test_gfs2_super, set_gfs2_super, flags, bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + error = PTR_ERR(s); + if (IS_ERR(s)) + goto error_bdev; + + if (s->s_root) { + /* + * s_umount nests inside bd_mutex during + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. + */ + up_write(&s->s_umount); + blkdev_put(bdev, mode); + down_write(&s->s_umount); + } + + memset(&args, 0, sizeof(args)); + args.ar_quota = GFS2_QUOTA_DEFAULT; + args.ar_data = GFS2_DATA_DEFAULT; + args.ar_commit = 30; + args.ar_statfs_quantum = 30; + args.ar_quota_quantum = 60; + args.ar_errors = GFS2_ERRORS_DEFAULT; - error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd); + error = gfs2_mount_args(&args, data); if (error) { - printk(KERN_WARNING "GFS2: path_lookup on %s returned error\n", - dev_name); - goto out; + pr_warn("can't parse mount arguments\n"); + goto error_super; } - error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat); - fstype = get_fs_type("gfs2"); - list_for_each_entry(s, &fstype->fs_supers, s_instances) { - if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) || - (S_ISDIR(stat.mode) && - s == nd.path.dentry->d_inode->i_sb)) { - sb = s; - goto free_nd; - } + if (s->s_root) { + error = -EBUSY; + if ((flags ^ s->s_flags) & MS_RDONLY) + goto error_super; + } else { + char b[BDEVNAME_SIZE]; + + s->s_mode = mode; + strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + sb_set_blocksize(s, block_size(bdev)); + error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0); + if (error) + goto error_super; + s->s_flags |= MS_ACTIVE; + bdev->bd_super = s; } - printk(KERN_WARNING "GFS2: Unrecognized block device or " - "mount point %s\n", dev_name); + sdp = s->s_fs_info; + if (args.ar_meta) + return dget(sdp->sd_master_dir); + else + return dget(sdp->sd_root_dir); + +error_super: + deactivate_locked_super(s); + return ERR_PTR(error); +error_bdev: + blkdev_put(bdev, mode); + return ERR_PTR(error); +} -free_nd: - path_put(&nd.path); -out: - return sb; +static int set_meta_super(struct super_block *s, void *ptr) +{ + return -EINVAL; } -static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - int error = 0; - struct super_block *sb = NULL, *new; + struct super_block *s; struct gfs2_sbd *sdp; + struct path path; + int error; - sb = get_gfs2_sb(dev_name); - if (!sb) { - printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); - error = -ENOENT; - goto error; - } - sdp = sb->s_fs_info; - if (sdp->sd_vfs_meta) { - printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n"); - error = -EBUSY; - goto error; - } - down(&sb->s_bdev->bd_mount_sem); - new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev); - up(&sb->s_bdev->bd_mount_sem); - if (IS_ERR(new)) { - error = PTR_ERR(new); - goto error; - } - module_put(fs_type->owner); - new->s_flags = flags; - strlcpy(new->s_id, sb->s_id, sizeof(new->s_id)); - sb_set_blocksize(new, sb->s_blocksize); - error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0); + error = kern_path(dev_name, LOOKUP_FOLLOW, &path); if (error) { - up_write(&new->s_umount); - deactivate_super(new); - goto error; + pr_warn("path_lookup on %s returned error %d\n", + dev_name, error); + return ERR_PTR(error); } - - new->s_flags |= MS_ACTIVE; - - /* Grab a reference to the gfs2 mount point */ - atomic_inc(&sdp->sd_gfs2mnt->mnt_count); - return simple_set_mnt(mnt, new); -error: - return error; + s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, + path.dentry->d_inode->i_sb->s_bdev); + path_put(&path); + if (IS_ERR(s)) { + pr_warn("gfs2 mount does not exist\n"); + return ERR_CAST(s); + } + if ((flags ^ s->s_flags) & MS_RDONLY) { + deactivate_locked_super(s); + return ERR_PTR(-EBUSY); + } + sdp = s->s_fs_info; + return dget(sdp->sd_master_dir); } static void gfs2_kill_sb(struct super_block *sb) { - if (sb->s_fs_info) { - gfs2_delete_debugfs_file(sb->s_fs_info); - gfs2_meta_syncfs(sb->s_fs_info); + struct gfs2_sbd *sdp = sb->s_fs_info; + + if (sdp == NULL) { + kill_block_super(sb); + return; } - kill_block_super(sb); -} -static void gfs2_kill_sb_meta(struct super_block *sb) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - generic_shutdown_super(sb); - sdp->sd_vfs_meta = NULL; - atomic_dec(&sdp->sd_gfs2mnt->mnt_count); + gfs2_log_flush(sdp, NULL, SYNC_FLUSH); + dput(sdp->sd_root_dir); + dput(sdp->sd_master_dir); + sdp->sd_root_dir = NULL; + sdp->sd_master_dir = NULL; + shrink_dcache_sb(sb); + gfs2_delete_debugfs_file(sdp); + free_percpu(sdp->sd_lkstats); + kill_block_super(sb); } struct file_system_type gfs2_fs_type = { .name = "gfs2", .fs_flags = FS_REQUIRES_DEV, - .get_sb = gfs2_get_sb, + .mount = gfs2_mount, .kill_sb = gfs2_kill_sb, .owner = THIS_MODULE, }; +MODULE_ALIAS_FS("gfs2"); struct file_system_type gfs2meta_fs_type = { .name = "gfs2meta", .fs_flags = FS_REQUIRES_DEV, - .get_sb = gfs2_get_sb_meta, - .kill_sb = gfs2_kill_sb_meta, + .mount = gfs2_mount_meta, .owner = THIS_MODULE, }; - +MODULE_ALIAS_FS("gfs2meta"); diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h deleted file mode 100644 index da849051183..00000000000 --- a/fs/gfs2/ops_fstype.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_FSTYPE_DOT_H__ -#define __OPS_FSTYPE_DOT_H__ - -#include <linux/fs.h> - -extern struct file_system_type gfs2_fs_type; -extern struct file_system_type gfs2meta_fs_type; -extern const struct export_operations gfs2_export_ops; - -#endif /* __OPS_FSTYPE_DOT_H__ */ diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c deleted file mode 100644 index e87412902be..00000000000 --- a/fs/gfs2/ops_inode.c +++ /dev/null @@ -1,1191 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/completion.h> -#include <linux/buffer_head.h> -#include <linux/namei.h> -#include <linux/utsname.h> -#include <linux/mm.h> -#include <linux/xattr.h> -#include <linux/posix_acl.h> -#include <linux/gfs2_ondisk.h> -#include <linux/crc32.h> -#include <linux/lm_interface.h> -#include <asm/uaccess.h> - -#include "gfs2.h" -#include "incore.h" -#include "acl.h" -#include "bmap.h" -#include "dir.h" -#include "eaops.h" -#include "eattr.h" -#include "glock.h" -#include "inode.h" -#include "meta_io.h" -#include "ops_dentry.h" -#include "ops_inode.h" -#include "quota.h" -#include "rgrp.h" -#include "trans.h" -#include "util.h" - -/** - * gfs2_create - Create a file - * @dir: The directory in which to create the file - * @dentry: The dentry of the new file - * @mode: The mode of the new file - * - * Returns: errno - */ - -static int gfs2_create(struct inode *dir, struct dentry *dentry, - int mode, struct nameidata *nd) -{ - struct gfs2_inode *dip = GFS2_I(dir); - struct gfs2_sbd *sdp = GFS2_SB(dir); - struct gfs2_holder ghs[2]; - struct inode *inode; - - gfs2_holder_init(dip->i_gl, 0, 0, ghs); - - for (;;) { - inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode, 0); - if (!IS_ERR(inode)) { - gfs2_trans_end(sdp); - if (dip->i_alloc->al_rgd) - gfs2_inplace_release(dip); - gfs2_quota_unlock(dip); - gfs2_alloc_put(dip); - gfs2_glock_dq_uninit_m(2, ghs); - mark_inode_dirty(inode); - break; - } else if (PTR_ERR(inode) != -EEXIST || - (nd && (nd->intent.open.flags & O_EXCL))) { - gfs2_holder_uninit(ghs); - return PTR_ERR(inode); - } - - inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd); - if (inode) { - if (!IS_ERR(inode)) { - gfs2_holder_uninit(ghs); - break; - } else { - gfs2_holder_uninit(ghs); - return PTR_ERR(inode); - } - } - } - - d_instantiate(dentry, inode); - - return 0; -} - -/** - * gfs2_lookup - Look up a filename in a directory and return its inode - * @dir: The directory inode - * @dentry: The dentry of the new inode - * @nd: passed from Linux VFS, ignored by us - * - * Called by the VFS layer. Lock dir and call gfs2_lookupi() - * - * Returns: errno - */ - -static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) -{ - struct inode *inode = NULL; - - dentry->d_op = &gfs2_dops; - - inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd); - if (inode && IS_ERR(inode)) - return ERR_CAST(inode); - - if (inode) { - struct gfs2_glock *gl = GFS2_I(inode)->i_gl; - struct gfs2_holder gh; - int error; - error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); - if (error) { - iput(inode); - return ERR_PTR(error); - } - gfs2_glock_dq_uninit(&gh); - return d_splice_alias(inode, dentry); - } - d_add(dentry, inode); - - return NULL; -} - -/** - * gfs2_link - Link to a file - * @old_dentry: The inode to link - * @dir: Add link to this directory - * @dentry: The name of the link - * - * Link the inode in "old_dentry" into the directory "dir" with the - * name in "dentry". - * - * Returns: errno - */ - -static int gfs2_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *dentry) -{ - struct gfs2_inode *dip = GFS2_I(dir); - struct gfs2_sbd *sdp = GFS2_SB(dir); - struct inode *inode = old_dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder ghs[2]; - int alloc_required; - int error; - - if (S_ISDIR(inode->i_mode)) - return -EPERM; - - gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); - - error = gfs2_glock_nq_m(2, ghs); - if (error) - goto out; - - error = permission(dir, MAY_WRITE | MAY_EXEC, NULL); - if (error) - goto out_gunlock; - - error = gfs2_dir_check(dir, &dentry->d_name, NULL); - switch (error) { - case -ENOENT: - break; - case 0: - error = -EEXIST; - default: - goto out_gunlock; - } - - error = -EINVAL; - if (!dip->i_inode.i_nlink) - goto out_gunlock; - error = -EFBIG; - if (dip->i_di.di_entries == (u32)-1) - goto out_gunlock; - error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto out_gunlock; - error = -EINVAL; - if (!ip->i_inode.i_nlink) - goto out_gunlock; - error = -EMLINK; - if (ip->i_inode.i_nlink == (u32)-1) - goto out_gunlock; - - alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name); - if (error < 0) - goto out_gunlock; - error = 0; - - if (alloc_required) { - struct gfs2_alloc *al = gfs2_alloc_get(dip); - - error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); - if (error) - goto out_alloc; - - error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid); - if (error) - goto out_gunlock_q; - - al->al_requested = sdp->sd_max_dirres; - - error = gfs2_inplace_reserve(dip); - if (error) - goto out_gunlock_q; - - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - al->al_rgd->rd_length + - 2 * RES_DINODE + RES_STATFS + - RES_QUOTA, 0); - if (error) - goto out_ipres; - } else { - error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0); - if (error) - goto out_ipres; - } - - error = gfs2_dir_add(dir, &dentry->d_name, ip, IF2DT(inode->i_mode)); - if (error) - goto out_end_trans; - - error = gfs2_change_nlink(ip, +1); - -out_end_trans: - gfs2_trans_end(sdp); -out_ipres: - if (alloc_required) - gfs2_inplace_release(dip); -out_gunlock_q: - if (alloc_required) - gfs2_quota_unlock(dip); -out_alloc: - if (alloc_required) - gfs2_alloc_put(dip); -out_gunlock: - gfs2_glock_dq_m(2, ghs); -out: - gfs2_holder_uninit(ghs); - gfs2_holder_uninit(ghs + 1); - if (!error) { - atomic_inc(&inode->i_count); - d_instantiate(dentry, inode); - mark_inode_dirty(inode); - } - return error; -} - -/** - * gfs2_unlink - Unlink a file - * @dir: The inode of the directory containing the file to unlink - * @dentry: The file itself - * - * Unlink a file. Call gfs2_unlinki() - * - * Returns: errno - */ - -static int gfs2_unlink(struct inode *dir, struct dentry *dentry) -{ - struct gfs2_inode *dip = GFS2_I(dir); - struct gfs2_sbd *sdp = GFS2_SB(dir); - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); - struct gfs2_holder ghs[3]; - struct gfs2_rgrpd *rgd; - struct gfs2_holder ri_gh; - int error; - - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - return error; - - gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); - - rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); - gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); - - - error = gfs2_glock_nq(ghs); /* parent */ - if (error) - goto out_parent; - - error = gfs2_glock_nq(ghs + 1); /* child */ - if (error) - goto out_child; - - error = gfs2_glock_nq(ghs + 2); /* rgrp */ - if (error) - goto out_rgrp; - - error = gfs2_unlink_ok(dip, &dentry->d_name, ip); - if (error) - goto out_rgrp; - - error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); - if (error) - goto out_rgrp; - - error = gfs2_dir_del(dip, &dentry->d_name); - if (error) - goto out_end_trans; - - error = gfs2_change_nlink(ip, -1); - -out_end_trans: - gfs2_trans_end(sdp); - gfs2_glock_dq(ghs + 2); -out_rgrp: - gfs2_holder_uninit(ghs + 2); - gfs2_glock_dq(ghs + 1); -out_child: - gfs2_holder_uninit(ghs + 1); - gfs2_glock_dq(ghs); -out_parent: - gfs2_holder_uninit(ghs); - gfs2_glock_dq_uninit(&ri_gh); - return error; -} - -/** - * gfs2_symlink - Create a symlink - * @dir: The directory to create the symlink in - * @dentry: The dentry to put the symlink in - * @symname: The thing which the link points to - * - * Returns: errno - */ - -static int gfs2_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) -{ - struct gfs2_inode *dip = GFS2_I(dir), *ip; - struct gfs2_sbd *sdp = GFS2_SB(dir); - struct gfs2_holder ghs[2]; - struct inode *inode; - struct buffer_head *dibh; - int size; - int error; - - /* Must be stuffed with a null terminator for gfs2_follow_link() */ - size = strlen(symname); - if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) - return -ENAMETOOLONG; - - gfs2_holder_init(dip->i_gl, 0, 0, ghs); - - inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO, 0); - if (IS_ERR(inode)) { - gfs2_holder_uninit(ghs); - return PTR_ERR(inode); - } - - ip = ghs[1].gh_gl->gl_object; - - ip->i_di.di_size = size; - - error = gfs2_meta_inode_buffer(ip, &dibh); - - if (!gfs2_assert_withdraw(sdp, !error)) { - gfs2_dinode_out(ip, dibh->b_data); - memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, - size); - brelse(dibh); - } - - gfs2_trans_end(sdp); - if (dip->i_alloc->al_rgd) - gfs2_inplace_release(dip); - gfs2_quota_unlock(dip); - gfs2_alloc_put(dip); - - gfs2_glock_dq_uninit_m(2, ghs); - - d_instantiate(dentry, inode); - mark_inode_dirty(inode); - - return 0; -} - -/** - * gfs2_mkdir - Make a directory - * @dir: The parent directory of the new one - * @dentry: The dentry of the new directory - * @mode: The mode of the new directory - * - * Returns: errno - */ - -static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) -{ - struct gfs2_inode *dip = GFS2_I(dir), *ip; - struct gfs2_sbd *sdp = GFS2_SB(dir); - struct gfs2_holder ghs[2]; - struct inode *inode; - struct buffer_head *dibh; - int error; - - gfs2_holder_init(dip->i_gl, 0, 0, ghs); - - inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode, 0); - if (IS_ERR(inode)) { - gfs2_holder_uninit(ghs); - return PTR_ERR(inode); - } - - ip = ghs[1].gh_gl->gl_object; - - ip->i_inode.i_nlink = 2; - ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); - ip->i_di.di_flags |= GFS2_DIF_JDATA; - ip->i_di.di_entries = 2; - - error = gfs2_meta_inode_buffer(ip, &dibh); - - if (!gfs2_assert_withdraw(sdp, !error)) { - struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; - struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); - struct qstr str; - - gfs2_str2qstr(&str, "."); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent); - dent->de_inum = di->di_num; /* already GFS2 endian */ - dent->de_type = cpu_to_be16(DT_DIR); - di->di_entries = cpu_to_be32(1); - - gfs2_str2qstr(&str, ".."); - dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); - gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); - - gfs2_inum_out(dip, dent); - dent->de_type = cpu_to_be16(DT_DIR); - - gfs2_dinode_out(ip, di); - - brelse(dibh); - } - - error = gfs2_change_nlink(dip, +1); - gfs2_assert_withdraw(sdp, !error); /* dip already pinned */ - - gfs2_trans_end(sdp); - if (dip->i_alloc->al_rgd) - gfs2_inplace_release(dip); - gfs2_quota_unlock(dip); - gfs2_alloc_put(dip); - - gfs2_glock_dq_uninit_m(2, ghs); - - d_instantiate(dentry, inode); - mark_inode_dirty(inode); - - return 0; -} - -/** - * gfs2_rmdir - Remove a directory - * @dir: The parent directory of the directory to be removed - * @dentry: The dentry of the directory to remove - * - * Remove a directory. Call gfs2_rmdiri() - * - * Returns: errno - */ - -static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) -{ - struct gfs2_inode *dip = GFS2_I(dir); - struct gfs2_sbd *sdp = GFS2_SB(dir); - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); - struct gfs2_holder ghs[3]; - struct gfs2_rgrpd *rgd; - struct gfs2_holder ri_gh; - int error; - - - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - return error; - gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); - - rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); - gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); - - error = gfs2_glock_nq_m(3, ghs); - if (error) - goto out; - - error = gfs2_unlink_ok(dip, &dentry->d_name, ip); - if (error) - goto out_gunlock; - - if (ip->i_di.di_entries < 2) { - if (gfs2_consist_inode(ip)) - gfs2_dinode_print(ip); - error = -EIO; - goto out_gunlock; - } - if (ip->i_di.di_entries > 2) { - error = -ENOTEMPTY; - goto out_gunlock; - } - - error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF + RES_RG_BIT, 0); - if (error) - goto out_gunlock; - - error = gfs2_rmdiri(dip, &dentry->d_name, ip); - - gfs2_trans_end(sdp); - -out_gunlock: - gfs2_glock_dq_m(3, ghs); -out: - gfs2_holder_uninit(ghs); - gfs2_holder_uninit(ghs + 1); - gfs2_holder_uninit(ghs + 2); - gfs2_glock_dq_uninit(&ri_gh); - return error; -} - -/** - * gfs2_mknod - Make a special file - * @dir: The directory in which the special file will reside - * @dentry: The dentry of the special file - * @mode: The mode of the special file - * @rdev: The device specification of the special file - * - */ - -static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, - dev_t dev) -{ - struct gfs2_inode *dip = GFS2_I(dir); - struct gfs2_sbd *sdp = GFS2_SB(dir); - struct gfs2_holder ghs[2]; - struct inode *inode; - - gfs2_holder_init(dip->i_gl, 0, 0, ghs); - - inode = gfs2_createi(ghs, &dentry->d_name, mode, dev); - if (IS_ERR(inode)) { - gfs2_holder_uninit(ghs); - return PTR_ERR(inode); - } - - gfs2_trans_end(sdp); - if (dip->i_alloc->al_rgd) - gfs2_inplace_release(dip); - gfs2_quota_unlock(dip); - gfs2_alloc_put(dip); - - gfs2_glock_dq_uninit_m(2, ghs); - - d_instantiate(dentry, inode); - mark_inode_dirty(inode); - - return 0; -} - -/** - * gfs2_rename - Rename a file - * @odir: Parent directory of old file name - * @odentry: The old dentry of the file - * @ndir: Parent directory of new file name - * @ndentry: The new dentry of the file - * - * Returns: errno - */ - -static int gfs2_rename(struct inode *odir, struct dentry *odentry, - struct inode *ndir, struct dentry *ndentry) -{ - struct gfs2_inode *odip = GFS2_I(odir); - struct gfs2_inode *ndip = GFS2_I(ndir); - struct gfs2_inode *ip = GFS2_I(odentry->d_inode); - struct gfs2_inode *nip = NULL; - struct gfs2_sbd *sdp = GFS2_SB(odir); - struct gfs2_holder ghs[5], r_gh; - struct gfs2_rgrpd *nrgd; - unsigned int num_gh; - int dir_rename = 0; - int alloc_required; - unsigned int x; - int error; - - if (ndentry->d_inode) { - nip = GFS2_I(ndentry->d_inode); - if (ip == nip) - return 0; - } - - /* Make sure we aren't trying to move a dirctory into it's subdir */ - - if (S_ISDIR(ip->i_inode.i_mode) && odip != ndip) { - dir_rename = 1; - - error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0, - &r_gh); - if (error) - goto out; - - error = gfs2_ok_to_move(ip, ndip); - if (error) - goto out_gunlock_r; - } - - num_gh = 1; - gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); - if (odip != ndip) { - gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); - num_gh++; - } - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); - num_gh++; - - if (nip) { - gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); - num_gh++; - /* grab the resource lock for unlink flag twiddling - * this is the case of the target file already existing - * so we unlink before doing the rename - */ - nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr); - if (nrgd) - gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); - } - - error = gfs2_glock_nq_m(num_gh, ghs); - if (error) - goto out_uninit; - - /* Check out the old directory */ - - error = gfs2_unlink_ok(odip, &odentry->d_name, ip); - if (error) - goto out_gunlock; - - /* Check out the new directory */ - - if (nip) { - error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip); - if (error) - goto out_gunlock; - - if (S_ISDIR(nip->i_inode.i_mode)) { - if (nip->i_di.di_entries < 2) { - if (gfs2_consist_inode(nip)) - gfs2_dinode_print(nip); - error = -EIO; - goto out_gunlock; - } - if (nip->i_di.di_entries > 2) { - error = -ENOTEMPTY; - goto out_gunlock; - } - } - } else { - error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL); - if (error) - goto out_gunlock; - - error = gfs2_dir_check(ndir, &ndentry->d_name, NULL); - switch (error) { - case -ENOENT: - error = 0; - break; - case 0: - error = -EEXIST; - default: - goto out_gunlock; - }; - - if (odip != ndip) { - if (!ndip->i_inode.i_nlink) { - error = -EINVAL; - goto out_gunlock; - } - if (ndip->i_di.di_entries == (u32)-1) { - error = -EFBIG; - goto out_gunlock; - } - if (S_ISDIR(ip->i_inode.i_mode) && - ndip->i_inode.i_nlink == (u32)-1) { - error = -EMLINK; - goto out_gunlock; - } - } - } - - /* Check out the dir to be renamed */ - - if (dir_rename) { - error = permission(odentry->d_inode, MAY_WRITE, NULL); - if (error) - goto out_gunlock; - } - - alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); - if (error < 0) - goto out_gunlock; - error = 0; - - if (alloc_required) { - struct gfs2_alloc *al = gfs2_alloc_get(ndip); - - error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); - if (error) - goto out_alloc; - - error = gfs2_quota_check(ndip, ndip->i_inode.i_uid, ndip->i_inode.i_gid); - if (error) - goto out_gunlock_q; - - al->al_requested = sdp->sd_max_dirres; - - error = gfs2_inplace_reserve(ndip); - if (error) - goto out_gunlock_q; - - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - al->al_rgd->rd_length + - 4 * RES_DINODE + 4 * RES_LEAF + - RES_STATFS + RES_QUOTA + 4, 0); - if (error) - goto out_ipreserv; - } else { - error = gfs2_trans_begin(sdp, 4 * RES_DINODE + - 5 * RES_LEAF + 4, 0); - if (error) - goto out_gunlock; - } - - /* Remove the target file, if it exists */ - - if (nip) { - if (S_ISDIR(nip->i_inode.i_mode)) - error = gfs2_rmdiri(ndip, &ndentry->d_name, nip); - else { - error = gfs2_dir_del(ndip, &ndentry->d_name); - if (error) - goto out_end_trans; - error = gfs2_change_nlink(nip, -1); - } - if (error) - goto out_end_trans; - } - - if (dir_rename) { - struct qstr name; - gfs2_str2qstr(&name, ".."); - - error = gfs2_change_nlink(ndip, +1); - if (error) - goto out_end_trans; - error = gfs2_change_nlink(odip, -1); - if (error) - goto out_end_trans; - - error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR); - if (error) - goto out_end_trans; - } else { - struct buffer_head *dibh; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out_end_trans; - ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - } - - error = gfs2_dir_del(odip, &odentry->d_name); - if (error) - goto out_end_trans; - - error = gfs2_dir_add(ndir, &ndentry->d_name, ip, IF2DT(ip->i_inode.i_mode)); - if (error) - goto out_end_trans; - -out_end_trans: - gfs2_trans_end(sdp); -out_ipreserv: - if (alloc_required) - gfs2_inplace_release(ndip); -out_gunlock_q: - if (alloc_required) - gfs2_quota_unlock(ndip); -out_alloc: - if (alloc_required) - gfs2_alloc_put(ndip); -out_gunlock: - gfs2_glock_dq_m(num_gh, ghs); -out_uninit: - for (x = 0; x < num_gh; x++) - gfs2_holder_uninit(ghs + x); -out_gunlock_r: - if (dir_rename) - gfs2_glock_dq_uninit(&r_gh); -out: - return error; -} - -/** - * gfs2_readlink - Read the value of a symlink - * @dentry: the symlink - * @buf: the buffer to read the symlink data into - * @size: the size of the buffer - * - * Returns: errno - */ - -static int gfs2_readlink(struct dentry *dentry, char __user *user_buf, - int user_size) -{ - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); - char array[GFS2_FAST_NAME_SIZE], *buf = array; - unsigned int len = GFS2_FAST_NAME_SIZE; - int error; - - error = gfs2_readlinki(ip, &buf, &len); - if (error) - return error; - - if (user_size > len - 1) - user_size = len - 1; - - if (copy_to_user(user_buf, buf, user_size)) - error = -EFAULT; - else - error = user_size; - - if (buf != array) - kfree(buf); - - return error; -} - -/** - * gfs2_follow_link - Follow a symbolic link - * @dentry: The dentry of the link - * @nd: Data that we pass to vfs_follow_link() - * - * This can handle symlinks of any size. It is optimised for symlinks - * under GFS2_FAST_NAME_SIZE. - * - * Returns: 0 on success or error code - */ - -static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); - char array[GFS2_FAST_NAME_SIZE], *buf = array; - unsigned int len = GFS2_FAST_NAME_SIZE; - int error; - - error = gfs2_readlinki(ip, &buf, &len); - if (!error) { - error = vfs_follow_link(nd, buf); - if (buf != array) - kfree(buf); - } - - return ERR_PTR(error); -} - -/** - * gfs2_permission - - * @inode: - * @mask: - * @nd: passed from Linux VFS, ignored by us - * - * This may be called from the VFS directly, or from within GFS2 with the - * inode locked, so we look to see if the glock is already locked and only - * lock the glock if its not already been done. - * - * Returns: errno - */ - -static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder i_gh; - int error; - int unlock = 0; - - if (gfs2_glock_is_locked_by_me(ip->i_gl) == 0) { - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - if (error) - return error; - unlock = 1; - } - - error = generic_permission(inode, mask, gfs2_check_acl); - if (unlock) - gfs2_glock_dq_uninit(&i_gh); - - return error; -} - -static int setattr_size(struct inode *inode, struct iattr *attr) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - int error; - - if (attr->ia_size != ip->i_di.di_size) { - error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); - if (error) - return error; - error = vmtruncate(inode, attr->ia_size); - gfs2_trans_end(sdp); - if (error) - return error; - } - - error = gfs2_truncatei(ip, attr->ia_size); - if (error && (inode->i_size != ip->i_di.di_size)) - i_size_write(inode, ip->i_di.di_size); - - return error; -} - -static int setattr_chown(struct inode *inode, struct iattr *attr) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct buffer_head *dibh; - u32 ouid, ogid, nuid, ngid; - int error; - - ouid = inode->i_uid; - ogid = inode->i_gid; - nuid = attr->ia_uid; - ngid = attr->ia_gid; - - if (!(attr->ia_valid & ATTR_UID) || ouid == nuid) - ouid = nuid = NO_QUOTA_CHANGE; - if (!(attr->ia_valid & ATTR_GID) || ogid == ngid) - ogid = ngid = NO_QUOTA_CHANGE; - - gfs2_alloc_get(ip); - - error = gfs2_quota_lock(ip, nuid, ngid); - if (error) - goto out_alloc; - - if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { - error = gfs2_quota_check(ip, nuid, ngid); - if (error) - goto out_gunlock_q; - } - - error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0); - if (error) - goto out_gunlock_q; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out_end_trans; - - error = inode_setattr(inode, attr); - gfs2_assert_warn(sdp, !error); - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - - if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { - gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid); - gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid); - } - -out_end_trans: - gfs2_trans_end(sdp); -out_gunlock_q: - gfs2_quota_unlock(ip); -out_alloc: - gfs2_alloc_put(ip); - return error; -} - -/** - * gfs2_setattr - Change attributes on an inode - * @dentry: The dentry which is changing - * @attr: The structure describing the change - * - * The VFS layer wants to change one or more of an inodes attributes. Write - * that change out to disk. - * - * Returns: errno - */ - -static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder i_gh; - int error; - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); - if (error) - return error; - - error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto out; - - error = inode_change_ok(inode, attr); - if (error) - goto out; - - if (attr->ia_valid & ATTR_SIZE) - error = setattr_size(inode, attr); - else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) - error = setattr_chown(inode, attr); - else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) - error = gfs2_acl_chmod(ip, attr); - else - error = gfs2_setattr_simple(ip, attr); - -out: - gfs2_glock_dq_uninit(&i_gh); - if (!error) - mark_inode_dirty(inode); - return error; -} - -/** - * gfs2_getattr - Read out an inode's attributes - * @mnt: The vfsmount the inode is being accessed from - * @dentry: The dentry to stat - * @stat: The inode's stats - * - * This may be called from the VFS directly, or from within GFS2 with the - * inode locked, so we look to see if the glock is already locked and only - * lock the glock if its not already been done. Note that its the NFS - * readdirplus operation which causes this to be called (from filldir) - * with the glock already held. - * - * Returns: errno - */ - -static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) -{ - struct inode *inode = dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int error; - int unlock = 0; - - if (gfs2_glock_is_locked_by_me(ip->i_gl) == 0) { - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); - if (error) - return error; - unlock = 1; - } - - generic_fillattr(inode, stat); - if (unlock) - gfs2_glock_dq_uninit(&gh); - - return 0; -} - -static int gfs2_setxattr(struct dentry *dentry, const char *name, - const void *data, size_t size, int flags) -{ - struct inode *inode = dentry->d_inode; - struct gfs2_ea_request er; - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_type = gfs2_ea_name2type(name, &er.er_name); - if (er.er_type == GFS2_EATYPE_UNUSED) - return -EOPNOTSUPP; - er.er_data = (char *)data; - er.er_name_len = strlen(er.er_name); - er.er_data_len = size; - er.er_flags = flags; - - gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE)); - - return gfs2_ea_set(GFS2_I(inode), &er); -} - -static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, - void *data, size_t size) -{ - struct gfs2_ea_request er; - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_type = gfs2_ea_name2type(name, &er.er_name); - if (er.er_type == GFS2_EATYPE_UNUSED) - return -EOPNOTSUPP; - er.er_data = data; - er.er_name_len = strlen(er.er_name); - er.er_data_len = size; - - return gfs2_ea_get(GFS2_I(dentry->d_inode), &er); -} - -static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - struct gfs2_ea_request er; - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_data = (size) ? buffer : NULL; - er.er_data_len = size; - - return gfs2_ea_list(GFS2_I(dentry->d_inode), &er); -} - -static int gfs2_removexattr(struct dentry *dentry, const char *name) -{ - struct gfs2_ea_request er; - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_type = gfs2_ea_name2type(name, &er.er_name); - if (er.er_type == GFS2_EATYPE_UNUSED) - return -EOPNOTSUPP; - er.er_name_len = strlen(er.er_name); - - return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); -} - -const struct inode_operations gfs2_file_iops = { - .permission = gfs2_permission, - .setattr = gfs2_setattr, - .getattr = gfs2_getattr, - .setxattr = gfs2_setxattr, - .getxattr = gfs2_getxattr, - .listxattr = gfs2_listxattr, - .removexattr = gfs2_removexattr, -}; - -const struct inode_operations gfs2_dev_iops = { - .permission = gfs2_permission, - .setattr = gfs2_setattr, - .getattr = gfs2_getattr, - .setxattr = gfs2_setxattr, - .getxattr = gfs2_getxattr, - .listxattr = gfs2_listxattr, - .removexattr = gfs2_removexattr, -}; - -const struct inode_operations gfs2_dir_iops = { - .create = gfs2_create, - .lookup = gfs2_lookup, - .link = gfs2_link, - .unlink = gfs2_unlink, - .symlink = gfs2_symlink, - .mkdir = gfs2_mkdir, - .rmdir = gfs2_rmdir, - .mknod = gfs2_mknod, - .rename = gfs2_rename, - .permission = gfs2_permission, - .setattr = gfs2_setattr, - .getattr = gfs2_getattr, - .setxattr = gfs2_setxattr, - .getxattr = gfs2_getxattr, - .listxattr = gfs2_listxattr, - .removexattr = gfs2_removexattr, -}; - -const struct inode_operations gfs2_symlink_iops = { - .readlink = gfs2_readlink, - .follow_link = gfs2_follow_link, - .permission = gfs2_permission, - .setattr = gfs2_setattr, - .getattr = gfs2_getattr, - .setxattr = gfs2_setxattr, - .getxattr = gfs2_getxattr, - .listxattr = gfs2_listxattr, - .removexattr = gfs2_removexattr, -}; - diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h deleted file mode 100644 index fd8cee231e1..00000000000 --- a/fs/gfs2/ops_inode.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_INODE_DOT_H__ -#define __OPS_INODE_DOT_H__ - -#include <linux/fs.h> - -extern const struct inode_operations gfs2_file_iops; -extern const struct inode_operations gfs2_dir_iops; -extern const struct inode_operations gfs2_symlink_iops; -extern const struct inode_operations gfs2_dev_iops; -extern const struct file_operations gfs2_file_fops; -extern const struct file_operations gfs2_dir_fops; -extern const struct file_operations gfs2_file_fops_nolock; -extern const struct file_operations gfs2_dir_fops_nolock; - -extern void gfs2_set_inode_flags(struct inode *inode); - -#endif /* __OPS_INODE_DOT_H__ */ diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c deleted file mode 100644 index 5e524217944..00000000000 --- a/fs/gfs2/ops_super.c +++ /dev/null @@ -1,515 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/completion.h> -#include <linux/buffer_head.h> -#include <linux/statfs.h> -#include <linux/seq_file.h> -#include <linux/mount.h> -#include <linux/kthread.h> -#include <linux/delay.h> -#include <linux/gfs2_ondisk.h> -#include <linux/crc32.h> -#include <linux/lm_interface.h> - -#include "gfs2.h" -#include "incore.h" -#include "glock.h" -#include "inode.h" -#include "lm.h" -#include "log.h" -#include "mount.h" -#include "ops_super.h" -#include "quota.h" -#include "recovery.h" -#include "rgrp.h" -#include "super.h" -#include "sys.h" -#include "util.h" -#include "trans.h" -#include "dir.h" -#include "eattr.h" -#include "bmap.h" - -/** - * gfs2_write_inode - Make sure the inode is stable on the disk - * @inode: The inode - * @sync: synchronous write flag - * - * Returns: errno - */ - -static int gfs2_write_inode(struct inode *inode, int sync) -{ - struct gfs2_inode *ip = GFS2_I(inode); - - /* Check this is a "normal" inode */ - if (inode->i_private) { - if (current->flags & PF_MEMALLOC) - return 0; - if (sync) - gfs2_log_flush(GFS2_SB(inode), ip->i_gl); - } - - return 0; -} - -/** - * gfs2_put_super - Unmount the filesystem - * @sb: The VFS superblock - * - */ - -static void gfs2_put_super(struct super_block *sb) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - int error; - - if (!sdp) - return; - - if (!strncmp(sb->s_type->name, "gfs2meta", 8)) - return; /* Nothing to do */ - - /* Unfreeze the filesystem, if we need to */ - - mutex_lock(&sdp->sd_freeze_lock); - if (sdp->sd_freeze_count) - gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); - mutex_unlock(&sdp->sd_freeze_lock); - - kthread_stop(sdp->sd_quotad_process); - kthread_stop(sdp->sd_logd_process); - kthread_stop(sdp->sd_recoverd_process); - while (sdp->sd_glockd_num--) - kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]); - - if (!(sb->s_flags & MS_RDONLY)) { - error = gfs2_make_fs_ro(sdp); - if (error) - gfs2_io_error(sdp); - } - /* At this point, we're through modifying the disk */ - - /* Release stuff */ - - iput(sdp->sd_master_dir); - iput(sdp->sd_jindex); - iput(sdp->sd_inum_inode); - iput(sdp->sd_statfs_inode); - iput(sdp->sd_rindex); - iput(sdp->sd_quota_inode); - - gfs2_glock_put(sdp->sd_rename_gl); - gfs2_glock_put(sdp->sd_trans_gl); - - if (!sdp->sd_args.ar_spectator) { - gfs2_glock_dq_uninit(&sdp->sd_journal_gh); - gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); - gfs2_glock_dq_uninit(&sdp->sd_ir_gh); - gfs2_glock_dq_uninit(&sdp->sd_sc_gh); - gfs2_glock_dq_uninit(&sdp->sd_qc_gh); - iput(sdp->sd_ir_inode); - iput(sdp->sd_sc_inode); - iput(sdp->sd_qc_inode); - } - - gfs2_glock_dq_uninit(&sdp->sd_live_gh); - gfs2_clear_rgrpd(sdp); - gfs2_jindex_free(sdp); - /* Take apart glock structures and buffer lists */ - gfs2_gl_hash_clear(sdp, WAIT); - /* Unmount the locking protocol */ - gfs2_lm_unmount(sdp); - - /* At this point, we're through participating in the lockspace */ - gfs2_sys_fs_del(sdp); - kfree(sdp); -} - -/** - * gfs2_write_super - * @sb: the superblock - * - */ - -static void gfs2_write_super(struct super_block *sb) -{ - sb->s_dirt = 0; -} - -/** - * gfs2_sync_fs - sync the filesystem - * @sb: the superblock - * - * Flushes the log to disk. - */ -static int gfs2_sync_fs(struct super_block *sb, int wait) -{ - sb->s_dirt = 0; - if (wait) - gfs2_log_flush(sb->s_fs_info, NULL); - return 0; -} - -/** - * gfs2_write_super_lockfs - prevent further writes to the filesystem - * @sb: the VFS structure for the filesystem - * - */ - -static void gfs2_write_super_lockfs(struct super_block *sb) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - int error; - - if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return; - - for (;;) { - error = gfs2_freeze_fs(sdp); - if (!error) - break; - - switch (error) { - case -EBUSY: - fs_err(sdp, "waiting for recovery before freeze\n"); - break; - - default: - fs_err(sdp, "error freezing FS: %d\n", error); - break; - } - - fs_err(sdp, "retrying...\n"); - msleep(1000); - } -} - -/** - * gfs2_unlockfs - reallow writes to the filesystem - * @sb: the VFS structure for the filesystem - * - */ - -static void gfs2_unlockfs(struct super_block *sb) -{ - gfs2_unfreeze_fs(sb->s_fs_info); -} - -/** - * gfs2_statfs - Gather and return stats about the filesystem - * @sb: The superblock - * @statfsbuf: The buffer - * - * Returns: 0 on success or error code - */ - -static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct super_block *sb = dentry->d_inode->i_sb; - struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_statfs_change_host sc; - int error; - - if (gfs2_tune_get(sdp, gt_statfs_slow)) - error = gfs2_statfs_slow(sdp, &sc); - else - error = gfs2_statfs_i(sdp, &sc); - - if (error) - return error; - - buf->f_type = GFS2_MAGIC; - buf->f_bsize = sdp->sd_sb.sb_bsize; - buf->f_blocks = sc.sc_total; - buf->f_bfree = sc.sc_free; - buf->f_bavail = sc.sc_free; - buf->f_files = sc.sc_dinodes + sc.sc_free; - buf->f_ffree = sc.sc_free; - buf->f_namelen = GFS2_FNAMESIZE; - - return 0; -} - -/** - * gfs2_remount_fs - called when the FS is remounted - * @sb: the filesystem - * @flags: the remount flags - * @data: extra data passed in (not used right now) - * - * Returns: errno - */ - -static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - int error; - - error = gfs2_mount_args(sdp, data, 1); - if (error) - return error; - - if (sdp->sd_args.ar_spectator) - *flags |= MS_RDONLY; - else { - if (*flags & MS_RDONLY) { - if (!(sb->s_flags & MS_RDONLY)) - error = gfs2_make_fs_ro(sdp); - } else if (!(*flags & MS_RDONLY) && - (sb->s_flags & MS_RDONLY)) { - error = gfs2_make_fs_rw(sdp); - } - } - - if (*flags & (MS_NOATIME | MS_NODIRATIME)) - set_bit(SDF_NOATIME, &sdp->sd_flags); - else - clear_bit(SDF_NOATIME, &sdp->sd_flags); - - /* Don't let the VFS update atimes. GFS2 handles this itself. */ - *flags |= MS_NOATIME | MS_NODIRATIME; - - return error; -} - -/** - * gfs2_drop_inode - Drop an inode (test for remote unlink) - * @inode: The inode to drop - * - * If we've received a callback on an iopen lock then its because a - * remote node tried to deallocate the inode but failed due to this node - * still having the inode open. Here we mark the link count zero - * since we know that it must have reached zero if the GLF_DEMOTE flag - * is set on the iopen glock. If we didn't do a disk read since the - * remote node removed the final link then we might otherwise miss - * this event. This check ensures that this node will deallocate the - * inode's blocks, or alternatively pass the baton on to another - * node for later deallocation. - */ -static void gfs2_drop_inode(struct inode *inode) -{ - if (inode->i_private && inode->i_nlink) { - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; - if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) - clear_nlink(inode); - } - generic_drop_inode(inode); -} - -/** - * gfs2_clear_inode - Deallocate an inode when VFS is done with it - * @inode: The VFS inode - * - */ - -static void gfs2_clear_inode(struct inode *inode) -{ - /* This tells us its a "real" inode and not one which only - * serves to contain an address space (see rgrp.c, meta_io.c) - * which therefore doesn't have its own glocks. - */ - if (inode->i_private) { - struct gfs2_inode *ip = GFS2_I(inode); - ip->i_gl->gl_object = NULL; - gfs2_glock_schedule_for_reclaim(ip->i_gl); - gfs2_glock_put(ip->i_gl); - ip->i_gl = NULL; - if (ip->i_iopen_gh.gh_gl) { - ip->i_iopen_gh.gh_gl->gl_object = NULL; - gfs2_glock_dq_uninit(&ip->i_iopen_gh); - } - } -} - -/** - * gfs2_show_options - Show mount options for /proc/mounts - * @s: seq_file structure - * @mnt: vfsmount - * - * Returns: 0 on success or error code - */ - -static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) -{ - struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; - struct gfs2_args *args = &sdp->sd_args; - - if (args->ar_lockproto[0]) - seq_printf(s, ",lockproto=%s", args->ar_lockproto); - if (args->ar_locktable[0]) - seq_printf(s, ",locktable=%s", args->ar_locktable); - if (args->ar_hostdata[0]) - seq_printf(s, ",hostdata=%s", args->ar_hostdata); - if (args->ar_spectator) - seq_printf(s, ",spectator"); - if (args->ar_ignore_local_fs) - seq_printf(s, ",ignore_local_fs"); - if (args->ar_localflocks) - seq_printf(s, ",localflocks"); - if (args->ar_localcaching) - seq_printf(s, ",localcaching"); - if (args->ar_debug) - seq_printf(s, ",debug"); - if (args->ar_upgrade) - seq_printf(s, ",upgrade"); - if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT) - seq_printf(s, ",num_glockd=%u", args->ar_num_glockd); - if (args->ar_posix_acl) - seq_printf(s, ",acl"); - if (args->ar_quota != GFS2_QUOTA_DEFAULT) { - char *state; - switch (args->ar_quota) { - case GFS2_QUOTA_OFF: - state = "off"; - break; - case GFS2_QUOTA_ACCOUNT: - state = "account"; - break; - case GFS2_QUOTA_ON: - state = "on"; - break; - default: - state = "unknown"; - break; - } - seq_printf(s, ",quota=%s", state); - } - if (args->ar_suiddir) - seq_printf(s, ",suiddir"); - if (args->ar_data != GFS2_DATA_DEFAULT) { - char *state; - switch (args->ar_data) { - case GFS2_DATA_WRITEBACK: - state = "writeback"; - break; - case GFS2_DATA_ORDERED: - state = "ordered"; - break; - default: - state = "unknown"; - break; - } - seq_printf(s, ",data=%s", state); - } - - return 0; -} - -/* - * We have to (at the moment) hold the inodes main lock to cover - * the gap between unlocking the shared lock on the iopen lock and - * taking the exclusive lock. I'd rather do a shared -> exclusive - * conversion on the iopen lock, but we can change that later. This - * is safe, just less efficient. - */ -static void gfs2_delete_inode(struct inode *inode) -{ - struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int error; - - if (!inode->i_private) - goto out; - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (unlikely(error)) { - gfs2_glock_dq_uninit(&ip->i_iopen_gh); - goto out; - } - - gfs2_glock_dq_wait(&ip->i_iopen_gh); - gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); - error = gfs2_glock_nq(&ip->i_iopen_gh); - if (error) - goto out_uninit; - - if (S_ISDIR(inode->i_mode) && - (ip->i_di.di_flags & GFS2_DIF_EXHASH)) { - error = gfs2_dir_exhash_dealloc(ip); - if (error) - goto out_unlock; - } - - if (ip->i_di.di_eattr) { - error = gfs2_ea_dealloc(ip); - if (error) - goto out_unlock; - } - - if (!gfs2_is_stuffed(ip)) { - error = gfs2_file_dealloc(ip); - if (error) - goto out_unlock; - } - - error = gfs2_dinode_dealloc(ip); - if (error) - goto out_unlock; - - error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); - if (error) - goto out_unlock; - /* Needs to be done before glock release & also in a transaction */ - truncate_inode_pages(&inode->i_data, 0); - gfs2_trans_end(sdp); - -out_unlock: - gfs2_glock_dq(&ip->i_iopen_gh); -out_uninit: - gfs2_holder_uninit(&ip->i_iopen_gh); - gfs2_glock_dq_uninit(&gh); - if (error && error != GLR_TRYFAILED) - fs_warn(sdp, "gfs2_delete_inode: %d\n", error); -out: - truncate_inode_pages(&inode->i_data, 0); - clear_inode(inode); -} - - - -static struct inode *gfs2_alloc_inode(struct super_block *sb) -{ - struct gfs2_inode *ip; - - ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); - if (ip) { - ip->i_flags = 0; - ip->i_gl = NULL; - } - return &ip->i_inode; -} - -static void gfs2_destroy_inode(struct inode *inode) -{ - kmem_cache_free(gfs2_inode_cachep, inode); -} - -const struct super_operations gfs2_super_ops = { - .alloc_inode = gfs2_alloc_inode, - .destroy_inode = gfs2_destroy_inode, - .write_inode = gfs2_write_inode, - .delete_inode = gfs2_delete_inode, - .put_super = gfs2_put_super, - .write_super = gfs2_write_super, - .sync_fs = gfs2_sync_fs, - .write_super_lockfs = gfs2_write_super_lockfs, - .unlockfs = gfs2_unlockfs, - .statfs = gfs2_statfs, - .remount_fs = gfs2_remount_fs, - .clear_inode = gfs2_clear_inode, - .drop_inode = gfs2_drop_inode, - .show_options = gfs2_show_options, -}; - diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h deleted file mode 100644 index 442a274c627..00000000000 --- a/fs/gfs2/ops_super.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_SUPER_DOT_H__ -#define __OPS_SUPER_DOT_H__ - -#include <linux/fs.h> - -extern const struct super_operations gfs2_super_ops; - -#endif /* __OPS_SUPER_DOT_H__ */ diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index a08dabd6ce9..64b29f7f6b4 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -15,7 +15,7 @@ * fuzziness in the current usage value of IDs that are being used on different * nodes in the cluster simultaneously. So, it is possible for a user on * multiple nodes to overrun their quota, but that overrun is controlable. - * Since quota tags are part of transactions, there is no need to a quota check + * Since quota tags are part of transactions, there is no need for a quota check * program to be run on node crashes or anything like that. * * There are couple of knobs that let the administrator manage the quota @@ -36,8 +36,11 @@ * the quota file, so it is not being constantly read. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> #include <linux/slab.h> +#include <linux/mm.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> @@ -45,7 +48,17 @@ #include <linux/fs.h> #include <linux/bio.h> #include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> +#include <linux/kthread.h> +#include <linux/freezer.h> +#include <linux/quota.h> +#include <linux/dqblk_xfs.h> +#include <linux/lockref.h> +#include <linux/list_lru.h> +#include <linux/rcupdate.h> +#include <linux/rculist_bl.h> +#include <linux/bit_spinlock.h> +#include <linux/jhash.h> +#include <linux/vmalloc.h> #include "gfs2.h" #include "incore.h" @@ -59,201 +72,296 @@ #include "super.h" #include "trans.h" #include "inode.h" -#include "ops_address.h" #include "util.h" -#define QUOTA_USER 1 -#define QUOTA_GROUP 0 +#define GFS2_QD_HASH_SHIFT 12 +#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT) +#define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1) -struct gfs2_quota_host { - u64 qu_limit; - u64 qu_warn; - s64 qu_value; - u32 qu_ll_next; -}; +/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */ +/* -> sd_bitmap_lock */ +static DEFINE_SPINLOCK(qd_lock); +struct list_lru gfs2_qd_lru; + +static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE]; + +static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp, + const struct kqid qid) +{ + unsigned int h; + + h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0); + h = jhash(&qid, sizeof(struct kqid), h); + + return h & GFS2_QD_HASH_MASK; +} + +static inline void spin_lock_bucket(unsigned int hash) +{ + hlist_bl_lock(&qd_hash_table[hash]); +} + +static inline void spin_unlock_bucket(unsigned int hash) +{ + hlist_bl_unlock(&qd_hash_table[hash]); +} + +static void gfs2_qd_dealloc(struct rcu_head *rcu) +{ + struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu); + kmem_cache_free(gfs2_quotad_cachep, qd); +} + +static void gfs2_qd_dispose(struct list_head *list) +{ + struct gfs2_quota_data *qd; + struct gfs2_sbd *sdp; + + while (!list_empty(list)) { + qd = list_entry(list->next, struct gfs2_quota_data, qd_lru); + sdp = qd->qd_gl->gl_sbd; + + list_del(&qd->qd_lru); + + /* Free from the filesystem-specific list */ + spin_lock(&qd_lock); + list_del(&qd->qd_list); + spin_unlock(&qd_lock); + + spin_lock_bucket(qd->qd_hash); + hlist_bl_del_rcu(&qd->qd_hlist); + spin_unlock_bucket(qd->qd_hash); -struct gfs2_quota_change_host { - u64 qc_change; - u32 qc_flags; /* GFS2_QCF_... */ - u32 qc_id; + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_count); + gfs2_assert_warn(sdp, !qd->qd_bh_count); + + gfs2_glock_put(qd->qd_gl); + atomic_dec(&sdp->sd_quota_count); + + /* Delete it from the common reclaim list */ + call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); + } +} + + +static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg) +{ + struct list_head *dispose = arg; + struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru); + + if (!spin_trylock(&qd->qd_lockref.lock)) + return LRU_SKIP; + + if (qd->qd_lockref.count == 0) { + lockref_mark_dead(&qd->qd_lockref); + list_move(&qd->qd_lru, dispose); + } + + spin_unlock(&qd->qd_lockref.lock); + return LRU_REMOVED; +} + +static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + LIST_HEAD(dispose); + unsigned long freed; + + if (!(sc->gfp_mask & __GFP_FS)) + return SHRINK_STOP; + + freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate, + &dispose, &sc->nr_to_scan); + + gfs2_qd_dispose(&dispose); + + return freed; +} + +static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid)); +} + +struct shrinker gfs2_qd_shrinker = { + .count_objects = gfs2_qd_shrink_count, + .scan_objects = gfs2_qd_shrink_scan, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE, }; + +static u64 qd2index(struct gfs2_quota_data *qd) +{ + struct kqid qid = qd->qd_id; + return (2 * (u64)from_kqid(&init_user_ns, qid)) + + ((qid.type == USRQUOTA) ? 0 : 1); +} + static u64 qd2offset(struct gfs2_quota_data *qd) { u64 offset; - offset = 2 * (u64)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags); + offset = qd2index(qd); offset *= sizeof(struct gfs2_quota); return offset; } -static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, - struct gfs2_quota_data **qdp) +static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid) { struct gfs2_quota_data *qd; int error; - qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL); + qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); if (!qd) - return -ENOMEM; + return NULL; - qd->qd_count = 1; - qd->qd_id = id; - if (user) - set_bit(QDF_USER, &qd->qd_flags); + qd->qd_sbd = sdp; + qd->qd_lockref.count = 1; + spin_lock_init(&qd->qd_lockref.lock); + qd->qd_id = qid; qd->qd_slot = -1; + INIT_LIST_HEAD(&qd->qd_lru); + qd->qd_hash = hash; - error = gfs2_glock_get(sdp, 2 * (u64)id + !user, + error = gfs2_glock_get(sdp, qd2index(qd), &gfs2_quota_glops, CREATE, &qd->qd_gl); if (error) goto fail; - error = gfs2_lvb_hold(qd->qd_gl); - gfs2_glock_put(qd->qd_gl); - if (error) - goto fail; - - *qdp = qd; - - return 0; + return qd; fail: - kfree(qd); - return error; + kmem_cache_free(gfs2_quotad_cachep, qd); + return NULL; } -static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create, - struct gfs2_quota_data **qdp) +static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash, + const struct gfs2_sbd *sdp, + struct kqid qid) { - struct gfs2_quota_data *qd = NULL, *new_qd = NULL; - int error, found; - - *qdp = NULL; + struct gfs2_quota_data *qd; + struct hlist_bl_node *h; - for (;;) { - found = 0; - spin_lock(&sdp->sd_quota_spin); - list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { - if (qd->qd_id == id && - !test_bit(QDF_USER, &qd->qd_flags) == !user) { - qd->qd_count++; - found = 1; - break; - } + hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) { + if (!qid_eq(qd->qd_id, qid)) + continue; + if (qd->qd_sbd != sdp) + continue; + if (lockref_get_not_dead(&qd->qd_lockref)) { + list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + return qd; } + } - if (!found) - qd = NULL; + return NULL; +} - if (!qd && new_qd) { - qd = new_qd; - list_add(&qd->qd_list, &sdp->sd_quota_list); - atomic_inc(&sdp->sd_quota_count); - new_qd = NULL; - } - spin_unlock(&sdp->sd_quota_spin); +static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd, *new_qd; + unsigned int hash = gfs2_qd_hash(sdp, qid); + + rcu_read_lock(); + *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); + rcu_read_unlock(); - if (qd || !create) { - if (new_qd) { - gfs2_lvb_unhold(new_qd->qd_gl); - kfree(new_qd); - } - *qdp = qd; - return 0; - } + if (qd) + return 0; - error = qd_alloc(sdp, user, id, &new_qd); - if (error) - return error; + new_qd = qd_alloc(hash, sdp, qid); + if (!new_qd) + return -ENOMEM; + + spin_lock(&qd_lock); + spin_lock_bucket(hash); + *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); + if (qd == NULL) { + *qdp = new_qd; + list_add(&new_qd->qd_list, &sdp->sd_quota_list); + hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]); + atomic_inc(&sdp->sd_quota_count); + } + spin_unlock_bucket(hash); + spin_unlock(&qd_lock); + + if (qd) { + gfs2_glock_put(new_qd->qd_gl); + kmem_cache_free(gfs2_quotad_cachep, new_qd); } + + return 0; } + static void qd_hold(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - - spin_lock(&sdp->sd_quota_spin); - gfs2_assert(sdp, qd->qd_count); - qd->qd_count++; - spin_unlock(&sdp->sd_quota_spin); + gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref)); + lockref_get(&qd->qd_lockref); } static void qd_put(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - spin_lock(&sdp->sd_quota_spin); - gfs2_assert(sdp, qd->qd_count); - if (!--qd->qd_count) - qd->qd_last_touched = jiffies; - spin_unlock(&sdp->sd_quota_spin); + if (lockref_put_or_lock(&qd->qd_lockref)) + return; + + qd->qd_lockref.count = 0; + list_lru_add(&gfs2_qd_lru, &qd->qd_lru); + spin_unlock(&qd->qd_lockref.lock); + } static int slot_get(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - unsigned int c, o = 0, b; - unsigned char byte = 0; + struct gfs2_sbd *sdp = qd->qd_sbd; + unsigned int bit; + int error = 0; - spin_lock(&sdp->sd_quota_spin); + spin_lock(&sdp->sd_bitmap_lock); + if (qd->qd_slot_count != 0) + goto out; - if (qd->qd_slot_count++) { - spin_unlock(&sdp->sd_quota_spin); - return 0; + error = -ENOSPC; + bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots); + if (bit < sdp->sd_quota_slots) { + set_bit(bit, sdp->sd_quota_bitmap); + qd->qd_slot = bit; + error = 0; +out: + qd->qd_slot_count++; } + spin_unlock(&sdp->sd_bitmap_lock); - for (c = 0; c < sdp->sd_quota_chunks; c++) - for (o = 0; o < PAGE_SIZE; o++) { - byte = sdp->sd_quota_bitmap[c][o]; - if (byte != 0xFF) - goto found; - } - - goto fail; - -found: - for (b = 0; b < 8; b++) - if (!(byte & (1 << b))) - break; - qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b; - - if (qd->qd_slot >= sdp->sd_quota_slots) - goto fail; - - sdp->sd_quota_bitmap[c][o] |= 1 << b; - - spin_unlock(&sdp->sd_quota_spin); - - return 0; - -fail: - qd->qd_slot_count--; - spin_unlock(&sdp->sd_quota_spin); - return -ENOSPC; + return error; } static void slot_hold(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; - spin_lock(&sdp->sd_quota_spin); + spin_lock(&sdp->sd_bitmap_lock); gfs2_assert(sdp, qd->qd_slot_count); qd->qd_slot_count++; - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&sdp->sd_bitmap_lock); } static void slot_put(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; - spin_lock(&sdp->sd_quota_spin); + spin_lock(&sdp->sd_bitmap_lock); gfs2_assert(sdp, qd->qd_slot_count); if (!--qd->qd_slot_count) { - gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); + BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap)); qd->qd_slot = -1; } - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&sdp->sd_bitmap_lock); } static int bh_get(struct gfs2_quota_data *qd) @@ -317,6 +425,24 @@ static void bh_put(struct gfs2_quota_data *qd) mutex_unlock(&sdp->sd_quota_mutex); } +static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd, + u64 *sync_gen) +{ + if (test_bit(QDF_LOCKED, &qd->qd_flags) || + !test_bit(QDF_CHANGE, &qd->qd_flags) || + (sync_gen && (qd->qd_sync_gen >= *sync_gen))) + return 0; + + if (!lockref_get_not_dead(&qd->qd_lockref)) + return 0; + + list_move_tail(&qd->qd_list, &sdp->sd_quota_list); + set_bit(QDF_LOCKED, &qd->qd_flags); + qd->qd_change_sync = qd->qd_change; + slot_hold(qd); + return 1; +} + static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) { struct gfs2_quota_data *qd = NULL; @@ -328,31 +454,18 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) if (sdp->sd_vfs->s_flags & MS_RDONLY) return 0; - spin_lock(&sdp->sd_quota_spin); + spin_lock(&qd_lock); list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { - if (test_bit(QDF_LOCKED, &qd->qd_flags) || - !test_bit(QDF_CHANGE, &qd->qd_flags) || - qd->qd_sync_gen >= sdp->sd_quota_sync_gen) - continue; - - list_move_tail(&qd->qd_list, &sdp->sd_quota_list); - - set_bit(QDF_LOCKED, &qd->qd_flags); - gfs2_assert_warn(sdp, qd->qd_count); - qd->qd_count++; - qd->qd_change_sync = qd->qd_change; - gfs2_assert_warn(sdp, qd->qd_slot_count); - qd->qd_slot_count++; - found = 1; - - break; + found = qd_check_sync(sdp, qd, &sdp->sd_quota_sync_gen); + if (found) + break; } if (!found) qd = NULL; - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lock); if (qd) { gfs2_assert_warn(sdp, qd->qd_change_sync); @@ -370,43 +483,6 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) return 0; } -static int qd_trylock(struct gfs2_quota_data *qd) -{ - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - - if (sdp->sd_vfs->s_flags & MS_RDONLY) - return 0; - - spin_lock(&sdp->sd_quota_spin); - - if (test_bit(QDF_LOCKED, &qd->qd_flags) || - !test_bit(QDF_CHANGE, &qd->qd_flags)) { - spin_unlock(&sdp->sd_quota_spin); - return 0; - } - - list_move_tail(&qd->qd_list, &sdp->sd_quota_list); - - set_bit(QDF_LOCKED, &qd->qd_flags); - gfs2_assert_warn(sdp, qd->qd_count); - qd->qd_count++; - qd->qd_change_sync = qd->qd_change; - gfs2_assert_warn(sdp, qd->qd_slot_count); - qd->qd_slot_count++; - - spin_unlock(&sdp->sd_quota_spin); - - gfs2_assert_warn(sdp, qd->qd_change_sync); - if (bh_get(qd)) { - clear_bit(QDF_LOCKED, &qd->qd_flags); - slot_put(qd); - qd_put(qd); - return 0; - } - - return 1; -} - static void qd_unlock(struct gfs2_quota_data *qd) { gfs2_assert_warn(qd->qd_gl->gl_sbd, @@ -417,12 +493,12 @@ static void qd_unlock(struct gfs2_quota_data *qd) qd_put(qd); } -static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, int create, +static int qdsb_get(struct gfs2_sbd *sdp, struct kqid qid, struct gfs2_quota_data **qdp) { int error; - error = qd_get(sdp, user, id, create, qdp); + error = qd_get(sdp, qid, qdp); if (error) return error; @@ -450,45 +526,54 @@ static void qdsb_put(struct gfs2_quota_data *qd) qd_put(qd); } -int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) +int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; - struct gfs2_quota_data **qd = al->al_qd; + struct gfs2_quota_data **qd; int error; - if (gfs2_assert_warn(sdp, !al->al_qd_num) || + if (ip->i_res == NULL) { + error = gfs2_rs_alloc(ip); + if (error) + return error; + } + + qd = ip->i_res->rs_qa_qd; + + if (gfs2_assert_warn(sdp, !ip->i_res->rs_qa_qd_num) || gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) return -EIO; if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; - error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, CREATE, qd); + error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd); if (error) goto out; - al->al_qd_num++; + ip->i_res->rs_qa_qd_num++; qd++; - error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, CREATE, qd); + error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd); if (error) goto out; - al->al_qd_num++; + ip->i_res->rs_qa_qd_num++; qd++; - if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) { - error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd); + if (!uid_eq(uid, NO_UID_QUOTA_CHANGE) && + !uid_eq(uid, ip->i_inode.i_uid)) { + error = qdsb_get(sdp, make_kqid_uid(uid), qd); if (error) goto out; - al->al_qd_num++; + ip->i_res->rs_qa_qd_num++; qd++; } - if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) { - error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd); + if (!gid_eq(gid, NO_GID_QUOTA_CHANGE) && + !gid_eq(gid, ip->i_inode.i_gid)) { + error = qdsb_get(sdp, make_kqid_gid(gid), qd); if (error) goto out; - al->al_qd_num++; + ip->i_res->rs_qa_qd_num++; qd++; } @@ -501,16 +586,17 @@ out: void gfs2_quota_unhold(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; unsigned int x; + if (ip->i_res == NULL) + return; gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); - for (x = 0; x < al->al_qd_num; x++) { - qdsb_put(al->al_qd[x]); - al->al_qd[x] = NULL; + for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { + qdsb_put(ip->i_res->rs_qa_qd[x]); + ip->i_res->rs_qa_qd[x] = NULL; } - al->al_qd_num = 0; + ip->i_res->rs_qa_qd_num = 0; } static int sort_qd(const void *a, const void *b) @@ -518,18 +604,10 @@ static int sort_qd(const void *a, const void *b) const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a; const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b; - if (!test_bit(QDF_USER, &qd_a->qd_flags) != - !test_bit(QDF_USER, &qd_b->qd_flags)) { - if (test_bit(QDF_USER, &qd_a->qd_flags)) - return -1; - else - return 1; - } - if (qd_a->qd_id < qd_b->qd_id) + if (qid_lt(qd_a->qd_id, qd_b->qd_id)) return -1; - if (qd_a->qd_id > qd_b->qd_id) + if (qid_lt(qd_b->qd_id, qd_a->qd_id)) return 1; - return 0; } @@ -541,22 +619,22 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) s64 x; mutex_lock(&sdp->sd_quota_mutex); - gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1); + gfs2_trans_add_meta(ip->i_gl, qd->qd_bh); if (!test_bit(QDF_CHANGE, &qd->qd_flags)) { qc->qc_change = 0; qc->qc_flags = 0; - if (test_bit(QDF_USER, &qd->qd_flags)) + if (qd->qd_id.type == USRQUOTA) qc->qc_flags = cpu_to_be32(GFS2_QCF_USER); - qc->qc_id = cpu_to_be32(qd->qd_id); + qc->qc_id = cpu_to_be32(from_kqid(&init_user_ns, qd->qd_id)); } x = be64_to_cpu(qc->qc_change) + change; qc->qc_change = cpu_to_be64(x); - spin_lock(&sdp->sd_quota_spin); + spin_lock(&qd_lock); qd->qd_change = x; - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lock); if (!x) { gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags)); @@ -573,60 +651,71 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) mutex_unlock(&sdp->sd_quota_mutex); } -static void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf) -{ - const struct gfs2_quota *str = buf; - - qu->qu_limit = be64_to_cpu(str->qu_limit); - qu->qu_warn = be64_to_cpu(str->qu_warn); - qu->qu_value = be64_to_cpu(str->qu_value); - qu->qu_ll_next = be32_to_cpu(str->qu_ll_next); -} - -static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf) -{ - struct gfs2_quota *str = buf; - - str->qu_limit = cpu_to_be64(qu->qu_limit); - str->qu_warn = cpu_to_be64(qu->qu_warn); - str->qu_value = cpu_to_be64(qu->qu_value); - str->qu_ll_next = cpu_to_be32(qu->qu_ll_next); - memset(&str->qu_reserved, 0, sizeof(str->qu_reserved)); -} - /** - * gfs2_adjust_quota + * gfs2_adjust_quota - adjust record of current block usage + * @ip: The quota inode + * @loc: Offset of the entry in the quota file + * @change: The amount of usage change to record + * @qd: The quota data + * @fdq: The updated limits to record * * This function was mostly borrowed from gfs2_block_truncate_page which was * in turn mostly borrowed from ext3 + * + * Returns: 0 or -ve on error */ + static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, - s64 change, struct gfs2_quota_data *qd) + s64 change, struct gfs2_quota_data *qd, + struct fs_disk_quota *fdq) { struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); struct address_space *mapping = inode->i_mapping; unsigned long index = loc >> PAGE_CACHE_SHIFT; unsigned offset = loc & (PAGE_CACHE_SIZE - 1); unsigned blocksize, iblock, pos; struct buffer_head *bh; struct page *page; - void *kaddr; - char *ptr; - struct gfs2_quota_host qp; - s64 value; - int err = -EIO; + void *kaddr, *ptr; + struct gfs2_quota q; + int err, nbytes; + u64 size; if (gfs2_is_stuffed(ip)) { - struct gfs2_alloc *al = NULL; - al = gfs2_alloc_get(ip); - /* just request 1 blk */ - al->al_requested = 1; - gfs2_inplace_reserve(ip); - gfs2_unstuff_dinode(ip, NULL); - gfs2_inplace_release(ip); - gfs2_alloc_put(ip); + err = gfs2_unstuff_dinode(ip, NULL); + if (err) + return err; + } + + memset(&q, 0, sizeof(struct gfs2_quota)); + err = gfs2_internal_read(ip, (char *)&q, &loc, sizeof(q)); + if (err < 0) + return err; + + err = -EIO; + be64_add_cpu(&q.qu_value, change); + qd->qd_qb.qb_value = q.qu_value; + if (fdq) { + if (fdq->d_fieldmask & FS_DQ_BSOFT) { + q.qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_warn = q.qu_warn; + } + if (fdq->d_fieldmask & FS_DQ_BHARD) { + q.qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_limit = q.qu_limit; + } + if (fdq->d_fieldmask & FS_DQ_BCOUNT) { + q.qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_value = q.qu_value; + } } - page = grab_cache_page(mapping, index); + + /* Write the quota into the quota file on disk */ + ptr = &q; + nbytes = sizeof(struct gfs2_quota); +get_a_page: + page = find_or_create_page(mapping, index, GFP_NOFS); if (!page) return -ENOMEM; @@ -647,35 +736,52 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, if (!buffer_mapped(bh)) { gfs2_block_map(inode, iblock, bh, 1); if (!buffer_mapped(bh)) - goto unlock; + goto unlock_out; + /* If it's a newly allocated disk block for quota, zero it */ + if (buffer_new(bh)) + zero_user(page, pos - blocksize, bh->b_size); } if (PageUptodate(page)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - ll_rw_block(READ_META, 1, &bh); + ll_rw_block(READ | REQ_META, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) - goto unlock; + goto unlock_out; } - gfs2_trans_add_bh(ip->i_gl, bh, 0); + gfs2_trans_add_data(ip->i_gl, bh); - kaddr = kmap_atomic(page, KM_USER0); - ptr = kaddr + offset; - gfs2_quota_in(&qp, ptr); - qp.qu_value += change; - value = qp.qu_value; - gfs2_quota_out(&qp, ptr); + kaddr = kmap_atomic(page); + if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) + nbytes = PAGE_CACHE_SIZE - offset; + memcpy(kaddr + offset, ptr, nbytes); flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - err = 0; - qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC); - qd->qd_qb.qb_value = cpu_to_be64(value); - ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_magic = cpu_to_be32(GFS2_MAGIC); - ((struct gfs2_quota_lvb*)(qd->qd_gl->gl_lvb))->qb_value = cpu_to_be64(value); -unlock: + kunmap_atomic(kaddr); + unlock_page(page); + page_cache_release(page); + + /* If quota straddles page boundary, we need to update the rest of the + * quota at the beginning of the next page */ + if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) { + ptr = ptr + nbytes; + nbytes = sizeof(struct gfs2_quota) - nbytes; + offset = 0; + index++; + goto get_a_page; + } + + size = loc + sizeof(struct gfs2_quota); + if (size > inode->i_size) + i_size_write(inode, size); + inode->i_mtime = inode->i_atime = CURRENT_TIME; + mark_inode_dirty(inode); + set_bit(QDF_REFRESH, &qd->qd_flags); + return 0; + +unlock_out: unlock_page(page); page_cache_release(page); return err; @@ -685,26 +791,31 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) { struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_alloc_parms ap = { .aflags = 0, }; unsigned int data_blocks, ind_blocks; struct gfs2_holder *ghs, i_gh; unsigned int qx, x; struct gfs2_quota_data *qd; + unsigned reserved; loff_t offset; - unsigned int nalloc = 0; - struct gfs2_alloc *al = NULL; + unsigned int nalloc = 0, blocks; int error; + error = gfs2_rs_alloc(ip); + if (error) + return error; + gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); - ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL); + ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_NOFS); if (!ghs) return -ENOMEM; sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); + mutex_lock(&ip->i_inode.i_mutex); for (qx = 0; qx < num_qd; qx++) { - error = gfs2_glock_nq_init(qda[qx]->qd_gl, - LM_ST_EXCLUSIVE, + error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, &ghs[qx]); if (error) goto out; @@ -715,53 +826,45 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) goto out; for (x = 0; x < num_qd; x++) { - int alloc_required; - offset = qd2offset(qda[x]); - error = gfs2_write_alloc_required(ip, offset, - sizeof(struct gfs2_quota), - &alloc_required); - if (error) - goto out_gunlock; - if (alloc_required) + if (gfs2_write_alloc_required(ip, offset, + sizeof(struct gfs2_quota))) nalloc++; } - if (nalloc) { - al = gfs2_alloc_get(ip); + /* + * 1 blk for unstuffing inode if stuffed. We add this extra + * block to the reservation unconditionally. If the inode + * doesn't need unstuffing, the block will be released to the + * rgrp since it won't be allocated during the transaction + */ + /* +3 in the end for unstuffing block, inode size update block + * and another block in case quota straddles page boundary and + * two blocks need to be updated instead of 1 */ + blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; + + reserved = 1 + (nalloc * (data_blocks + ind_blocks)); + ap.target = reserved; + error = gfs2_inplace_reserve(ip, &ap); + if (error) + goto out_alloc; - al->al_requested = nalloc * (data_blocks + ind_blocks); + if (nalloc) + blocks += gfs2_rg_blocks(ip, reserved) + nalloc * ind_blocks + RES_STATFS; - error = gfs2_inplace_reserve(ip); - if (error) - goto out_alloc; - - error = gfs2_trans_begin(sdp, - al->al_rgd->rd_length + - num_qd * data_blocks + - nalloc * ind_blocks + - RES_DINODE + num_qd + - RES_STATFS, 0); - if (error) - goto out_ipres; - } else { - error = gfs2_trans_begin(sdp, - num_qd * data_blocks + - RES_DINODE + num_qd, 0); - if (error) - goto out_gunlock; - } + error = gfs2_trans_begin(sdp, blocks, 0); + if (error) + goto out_ipres; for (x = 0; x < num_qd; x++) { qd = qda[x]; offset = qd2offset(qd); - error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, - (struct gfs2_quota_data *) - qd); + error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, qd, NULL); if (error) goto out_end_trans; do_qc(qd, -qd->qd_change_sync); + set_bit(QDF_REFRESH, &qd->qd_flags); } error = 0; @@ -769,45 +872,62 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) out_end_trans: gfs2_trans_end(sdp); out_ipres: - if (nalloc) - gfs2_inplace_release(ip); + gfs2_inplace_release(ip); out_alloc: - if (nalloc) - gfs2_alloc_put(ip); -out_gunlock: gfs2_glock_dq_uninit(&i_gh); out: while (qx--) gfs2_glock_dq_uninit(&ghs[qx]); + mutex_unlock(&ip->i_inode.i_mutex); kfree(ghs); - gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl); + gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl, NORMAL_FLUSH); return error; } +static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_quota q; + struct gfs2_quota_lvb *qlvb; + loff_t pos; + int error; + + memset(&q, 0, sizeof(struct gfs2_quota)); + pos = qd2offset(qd); + error = gfs2_internal_read(ip, (char *)&q, &pos, sizeof(q)); + if (error < 0) + return error; + + qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; + qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); + qlvb->__pad = 0; + qlvb->qb_limit = q.qu_limit; + qlvb->qb_warn = q.qu_warn; + qlvb->qb_value = q.qu_value; + qd->qd_qb = *qlvb; + + return 0; +} + static int do_glock(struct gfs2_quota_data *qd, int force_refresh, struct gfs2_holder *q_gh) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct gfs2_holder i_gh; - struct gfs2_quota_host q; - char buf[sizeof(struct gfs2_quota)]; int error; - struct gfs2_quota_lvb *qlvb; restart: error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh); if (error) return error; - qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; + qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { - loff_t pos; gfs2_glock_dq_uninit(q_gh); - error = gfs2_glock_nq_init(qd->qd_gl, - LM_ST_EXCLUSIVE, GL_NOCACHE, - q_gh); + error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, q_gh); if (error) return error; @@ -815,30 +935,14 @@ restart: if (error) goto fail; - memset(buf, 0, sizeof(struct gfs2_quota)); - pos = qd2offset(qd); - error = gfs2_internal_read(ip, NULL, buf, &pos, - sizeof(struct gfs2_quota)); - if (error < 0) + error = update_qd(sdp, qd); + if (error) goto fail_gunlock; gfs2_glock_dq_uninit(&i_gh); - - - gfs2_quota_in(&q, buf); - qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; - qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); - qlvb->__pad = 0; - qlvb->qb_limit = cpu_to_be64(q.qu_limit); - qlvb->qb_warn = cpu_to_be64(q.qu_warn); - qlvb->qb_value = cpu_to_be64(q.qu_value); - qd->qd_qb = *qlvb; - - if (gfs2_glock_is_blocking(qd->qd_gl)) { - gfs2_glock_dq_uninit(q_gh); - force_refresh = 0; - goto restart; - } + gfs2_glock_dq_uninit(q_gh); + force_refresh = 0; + goto restart; } return 0; @@ -850,24 +954,30 @@ fail: return error; } -int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) +int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_quota_data *qd; unsigned int x; int error = 0; - gfs2_quota_hold(ip, uid, gid); + error = gfs2_quota_hold(ip, uid, gid); + if (error) + return error; if (capable(CAP_SYS_RESOURCE) || sdp->sd_args.ar_quota != GFS2_QUOTA_ON) return 0; - sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *), - sort_qd, NULL); + sort(ip->i_res->rs_qa_qd, ip->i_res->rs_qa_qd_num, + sizeof(struct gfs2_quota_data *), sort_qd, NULL); - for (x = 0; x < al->al_qd_num; x++) { - error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]); + for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { + int force = NO_FORCE; + qd = ip->i_res->rs_qa_qd[x]; + if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags)) + force = FORCE; + error = do_glock(qd, force, &ip->i_res->rs_qa_qd_ghs[x]); if (error) break; } @@ -876,7 +986,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) set_bit(GIF_QD_LOCKED, &ip->i_flags); else { while (x--) - gfs2_glock_dq_uninit(&al->al_qd_ghs[x]); + gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]); gfs2_quota_unhold(ip); } @@ -894,9 +1004,9 @@ static int need_sync(struct gfs2_quota_data *qd) if (!qd->qd_qb.qb_limit) return 0; - spin_lock(&sdp->sd_quota_spin); + spin_lock(&qd_lock); value = qd->qd_change; - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lock); spin_lock(>->gt_spin); num = gt->gt_quota_scale_num; @@ -910,7 +1020,7 @@ static int need_sync(struct gfs2_quota_data *qd) do_sync = 0; else { value *= gfs2_jindex_size(sdp) * num; - do_div(value, den); + value = div_s64(value, den); value += (s64)be64_to_cpu(qd->qd_qb.qb_value); if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit)) do_sync = 0; @@ -921,25 +1031,42 @@ static int need_sync(struct gfs2_quota_data *qd) void gfs2_quota_unlock(struct gfs2_inode *ip) { - struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qda[4]; unsigned int count = 0; unsigned int x; + int found; if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags)) goto out; - for (x = 0; x < al->al_qd_num; x++) { + for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { struct gfs2_quota_data *qd; int sync; - qd = al->al_qd[x]; + qd = ip->i_res->rs_qa_qd[x]; sync = need_sync(qd); - gfs2_glock_dq_uninit(&al->al_qd_ghs[x]); + gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]); + if (!sync) + continue; + + spin_lock(&qd_lock); + found = qd_check_sync(sdp, qd, NULL); + spin_unlock(&qd_lock); + + if (!found) + continue; - if (sync && qd_trylock(qd)) - qda[count++] = qd; + gfs2_assert_warn(sdp, qd->qd_change_sync); + if (bh_get(qd)) { + clear_bit(QDF_LOCKED, &qd->qd_flags); + slot_put(qd); + qd_put(qd); + continue; + } + + qda[count++] = qd; } if (count) { @@ -958,18 +1085,17 @@ static int print_message(struct gfs2_quota_data *qd, char *type) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\r\n", - sdp->sd_fsname, type, - (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group", - qd->qd_id); + fs_info(sdp, "quota %s for %s %u\n", + type, + (qd->qd_id.type == USRQUOTA) ? "user" : "group", + from_kqid(&init_user_ns, qd->qd_id)); return 0; } -int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) +int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; struct gfs2_quota_data *qd; s64 value; unsigned int x; @@ -981,20 +1107,23 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) return 0; - for (x = 0; x < al->al_qd_num; x++) { - qd = al->al_qd[x]; + for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { + qd = ip->i_res->rs_qa_qd[x]; - if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || - (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags)))) + if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) || + qid_eq(qd->qd_id, make_kqid_gid(gid)))) continue; value = (s64)be64_to_cpu(qd->qd_qb.qb_value); - spin_lock(&sdp->sd_quota_spin); + spin_lock(&qd_lock); value += qd->qd_change; - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lock); if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { print_message(qd, "exceeded"); + quota_send_warning(qd->qd_id, + sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN); + error = -EDQUOT; break; } else if (be64_to_cpu(qd->qd_qb.qb_warn) && @@ -1002,6 +1131,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) time_after_eq(jiffies, qd->qd_last_warn + gfs2_tune_get(sdp, gt_quota_warn_period) * HZ)) { + quota_send_warning(qd->qd_id, + sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN); error = print_message(qd, "warning"); qd->qd_last_warn = jiffies; } @@ -1011,41 +1142,42 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) } void gfs2_quota_change(struct gfs2_inode *ip, s64 change, - u32 uid, u32 gid) + kuid_t uid, kgid_t gid) { - struct gfs2_alloc *al = ip->i_alloc; struct gfs2_quota_data *qd; unsigned int x; if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) return; - if (ip->i_di.di_flags & GFS2_DIF_SYSTEM) + if (ip->i_diskflags & GFS2_DIF_SYSTEM) return; - for (x = 0; x < al->al_qd_num; x++) { - qd = al->al_qd[x]; + for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { + qd = ip->i_res->rs_qa_qd[x]; - if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || - (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) { + if (qid_eq(qd->qd_id, make_kqid_uid(uid)) || + qid_eq(qd->qd_id, make_kqid_gid(gid))) { do_qc(qd, change); } } } -int gfs2_quota_sync(struct gfs2_sbd *sdp) +int gfs2_quota_sync(struct super_block *sb, int type) { + struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_data **qda; - unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync); + unsigned int max_qd = PAGE_SIZE/sizeof(struct gfs2_holder); unsigned int num_qd; unsigned int x; int error = 0; - sdp->sd_quota_sync_gen++; - qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL); if (!qda) return -ENOMEM; + mutex_lock(&sdp->sd_quota_sync_mutex); + sdp->sd_quota_sync_gen++; + do { num_qd = 0; @@ -1070,18 +1202,19 @@ int gfs2_quota_sync(struct gfs2_sbd *sdp) } } while (!error && num_qd == max_qd); + mutex_unlock(&sdp->sd_quota_sync_mutex); kfree(qda); return error; } -int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) +int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) { struct gfs2_quota_data *qd; struct gfs2_holder q_gh; int error; - error = qd_get(sdp, user, id, CREATE, &qd); + error = qd_get(sdp, qid, &qd); if (error) return error; @@ -1090,52 +1223,39 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) gfs2_glock_dq_uninit(&q_gh); qd_put(qd); - return error; } -static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf) -{ - const struct gfs2_quota_change *str = buf; - - qc->qc_change = be64_to_cpu(str->qc_change); - qc->qc_flags = be32_to_cpu(str->qc_flags); - qc->qc_id = be32_to_cpu(str->qc_id); -} - int gfs2_quota_init(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); - unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; + u64 size = i_size_read(sdp->sd_qc_inode); + unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; unsigned int x, slot = 0; unsigned int found = 0; + unsigned int hash; + unsigned int bm_size; u64 dblock; u32 extlen = 0; int error; - if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) || - ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) { - gfs2_consist_inode(ip); + if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20)) return -EIO; - } - sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; - sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); + sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; + bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long)); + bm_size *= sizeof(unsigned long); error = -ENOMEM; - - sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, - sizeof(unsigned char *), GFP_KERNEL); + sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN); + if (sdp->sd_quota_bitmap == NULL) + sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS | + __GFP_ZERO, PAGE_KERNEL); if (!sdp->sd_quota_bitmap) return error; - for (x = 0; x < sdp->sd_quota_chunks; x++) { - sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!sdp->sd_quota_bitmap[x]) - goto fail; - } - for (x = 0; x < blocks; x++) { struct buffer_head *bh; + const struct gfs2_quota_change *qc; unsigned int y; if (!extlen) { @@ -1153,35 +1273,41 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) goto fail; } + qc = (const struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header)); for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; y++, slot++) { - struct gfs2_quota_change_host qc; struct gfs2_quota_data *qd; - - gfs2_quota_change_in(&qc, bh->b_data + - sizeof(struct gfs2_meta_header) + - y * sizeof(struct gfs2_quota_change)); - if (!qc.qc_change) + s64 qc_change = be64_to_cpu(qc->qc_change); + u32 qc_flags = be32_to_cpu(qc->qc_flags); + enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ? + USRQUOTA : GRPQUOTA; + struct kqid qc_id = make_kqid(&init_user_ns, qtype, + be32_to_cpu(qc->qc_id)); + qc++; + if (!qc_change) continue; - error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER), - qc.qc_id, &qd); - if (error) { + hash = gfs2_qd_hash(sdp, qc_id); + qd = qd_alloc(hash, sdp, qc_id); + if (qd == NULL) { brelse(bh); goto fail; } set_bit(QDF_CHANGE, &qd->qd_flags); - qd->qd_change = qc.qc_change; + qd->qd_change = qc_change; qd->qd_slot = slot; qd->qd_slot_count = 1; - qd->qd_last_touched = jiffies; - spin_lock(&sdp->sd_quota_spin); - gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); + spin_lock(&qd_lock); + BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap)); list_add(&qd->qd_list, &sdp->sd_quota_list); atomic_inc(&sdp->sd_quota_count); - spin_unlock(&sdp->sd_quota_spin); + spin_unlock(&qd_lock); + + spin_lock_bucket(hash); + hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]); + spin_unlock_bucket(hash); found++; } @@ -1201,80 +1327,333 @@ fail: return error; } -void gfs2_quota_scan(struct gfs2_sbd *sdp) +void gfs2_quota_cleanup(struct gfs2_sbd *sdp) { - struct gfs2_quota_data *qd, *safe; - LIST_HEAD(dead); - - spin_lock(&sdp->sd_quota_spin); - list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) { - if (!qd->qd_count && - time_after_eq(jiffies, qd->qd_last_touched + - gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) { - list_move(&qd->qd_list, &dead); - gfs2_assert_warn(sdp, - atomic_read(&sdp->sd_quota_count) > 0); - atomic_dec(&sdp->sd_quota_count); - } - } - spin_unlock(&sdp->sd_quota_spin); + struct list_head *head = &sdp->sd_quota_list; + struct gfs2_quota_data *qd; + + spin_lock(&qd_lock); + while (!list_empty(head)) { + qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); - while (!list_empty(&dead)) { - qd = list_entry(dead.next, struct gfs2_quota_data, qd_list); list_del(&qd->qd_list); + /* Also remove if this qd exists in the reclaim list */ + list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + atomic_dec(&sdp->sd_quota_count); + spin_unlock(&qd_lock); + + spin_lock_bucket(qd->qd_hash); + hlist_bl_del_rcu(&qd->qd_hlist); + spin_unlock_bucket(qd->qd_hash); + gfs2_assert_warn(sdp, !qd->qd_change); gfs2_assert_warn(sdp, !qd->qd_slot_count); gfs2_assert_warn(sdp, !qd->qd_bh_count); - gfs2_lvb_unhold(qd->qd_gl); - kfree(qd); + gfs2_glock_put(qd->qd_gl); + call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); + + spin_lock(&qd_lock); + } + spin_unlock(&qd_lock); + + gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); + + if (sdp->sd_quota_bitmap) { + if (is_vmalloc_addr(sdp->sd_quota_bitmap)) + vfree(sdp->sd_quota_bitmap); + else + kfree(sdp->sd_quota_bitmap); + sdp->sd_quota_bitmap = NULL; } } -void gfs2_quota_cleanup(struct gfs2_sbd *sdp) +static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) { - struct list_head *head = &sdp->sd_quota_list; - struct gfs2_quota_data *qd; - unsigned int x; + if (error == 0 || error == -EROFS) + return; + if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); +} - spin_lock(&sdp->sd_quota_spin); - while (!list_empty(head)) { - qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); +static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, + int (*fxn)(struct super_block *sb, int type), + unsigned long t, unsigned long *timeo, + unsigned int *new_timeo) +{ + if (t >= *timeo) { + int error = fxn(sdp->sd_vfs, 0); + quotad_error(sdp, msg, error); + *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ; + } else { + *timeo -= t; + } +} - if (qd->qd_count > 1 || - (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) { - list_move(&qd->qd_list, head); - spin_unlock(&sdp->sd_quota_spin); - schedule(); - spin_lock(&sdp->sd_quota_spin); - continue; +static void quotad_check_trunc_list(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip; + + while(1) { + ip = NULL; + spin_lock(&sdp->sd_trunc_lock); + if (!list_empty(&sdp->sd_trunc_list)) { + ip = list_entry(sdp->sd_trunc_list.next, + struct gfs2_inode, i_trunc_list); + list_del_init(&ip->i_trunc_list); } + spin_unlock(&sdp->sd_trunc_lock); + if (ip == NULL) + return; + gfs2_glock_finish_truncate(ip); + } +} - list_del(&qd->qd_list); - atomic_dec(&sdp->sd_quota_count); - spin_unlock(&sdp->sd_quota_spin); +void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) { + if (!sdp->sd_statfs_force_sync) { + sdp->sd_statfs_force_sync = 1; + wake_up(&sdp->sd_quota_wait); + } +} - if (!qd->qd_count) { - gfs2_assert_warn(sdp, !qd->qd_change); - gfs2_assert_warn(sdp, !qd->qd_slot_count); - } else - gfs2_assert_warn(sdp, qd->qd_slot_count == 1); - gfs2_assert_warn(sdp, !qd->qd_bh_count); - gfs2_lvb_unhold(qd->qd_gl); - kfree(qd); +/** + * gfs2_quotad - Write cached quota changes into the quota file + * @sdp: Pointer to GFS2 superblock + * + */ + +int gfs2_quotad(void *data) +{ + struct gfs2_sbd *sdp = data; + struct gfs2_tune *tune = &sdp->sd_tune; + unsigned long statfs_timeo = 0; + unsigned long quotad_timeo = 0; + unsigned long t = 0; + DEFINE_WAIT(wait); + int empty; + + while (!kthread_should_stop()) { + + /* Update the master statfs file */ + if (sdp->sd_statfs_force_sync) { + int error = gfs2_statfs_sync(sdp->sd_vfs, 0); + quotad_error(sdp, "statfs", error); + statfs_timeo = gfs2_tune_get(sdp, gt_statfs_quantum) * HZ; + } + else + quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t, + &statfs_timeo, + &tune->gt_statfs_quantum); - spin_lock(&sdp->sd_quota_spin); + /* Update quota file */ + quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, + "ad_timeo, &tune->gt_quota_quantum); + + /* Check for & recover partially truncated inodes */ + quotad_check_trunc_list(sdp); + + try_to_freeze(); + + t = min(quotad_timeo, statfs_timeo); + + prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE); + spin_lock(&sdp->sd_trunc_lock); + empty = list_empty(&sdp->sd_trunc_list); + spin_unlock(&sdp->sd_trunc_lock); + if (empty && !sdp->sd_statfs_force_sync) + t -= schedule_timeout(t); + else + t = 0; + finish_wait(&sdp->sd_quota_wait, &wait); } - spin_unlock(&sdp->sd_quota_spin); - gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); + return 0; +} - if (sdp->sd_quota_bitmap) { - for (x = 0; x < sdp->sd_quota_chunks; x++) - kfree(sdp->sd_quota_bitmap[x]); - kfree(sdp->sd_quota_bitmap); +static int gfs2_quota_get_xstate(struct super_block *sb, + struct fs_quota_stat *fqs) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + + memset(fqs, 0, sizeof(struct fs_quota_stat)); + fqs->qs_version = FS_QSTAT_VERSION; + + switch (sdp->sd_args.ar_quota) { + case GFS2_QUOTA_ON: + fqs->qs_flags |= (FS_QUOTA_UDQ_ENFD | FS_QUOTA_GDQ_ENFD); + /*FALLTHRU*/ + case GFS2_QUOTA_ACCOUNT: + fqs->qs_flags |= (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT); + break; + case GFS2_QUOTA_OFF: + break; + } + + if (sdp->sd_quota_inode) { + fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr; + fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks; + } + fqs->qs_uquota.qfs_nextents = 1; /* unsupported */ + fqs->qs_gquota = fqs->qs_uquota; /* its the same inode in both cases */ + fqs->qs_incoredqs = list_lru_count(&gfs2_qd_lru); + return 0; +} + +static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, + struct fs_disk_quota *fdq) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_quota_lvb *qlvb; + struct gfs2_quota_data *qd; + struct gfs2_holder q_gh; + int error; + + memset(fdq, 0, sizeof(struct fs_disk_quota)); + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return -ESRCH; /* Crazy XFS error code */ + + if ((qid.type != USRQUOTA) && + (qid.type != GRPQUOTA)) + return -EINVAL; + + error = qd_get(sdp, qid, &qd); + if (error) + return error; + error = do_glock(qd, FORCE, &q_gh); + if (error) + goto out; + + qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; + fdq->d_version = FS_DQUOT_VERSION; + fdq->d_flags = (qid.type == USRQUOTA) ? FS_USER_QUOTA : FS_GROUP_QUOTA; + fdq->d_id = from_kqid_munged(current_user_ns(), qid); + fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift; + fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift; + fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift; + + gfs2_glock_dq_uninit(&q_gh); +out: + qd_put(qd); + return error; +} + +/* GFS2 only supports a subset of the XFS fields */ +#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT) + +static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, + struct fs_disk_quota *fdq) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_quota_data *qd; + struct gfs2_holder q_gh, i_gh; + unsigned int data_blocks, ind_blocks; + unsigned int blocks = 0; + int alloc_required; + loff_t offset; + int error; + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return -ESRCH; /* Crazy XFS error code */ + + if ((qid.type != USRQUOTA) && + (qid.type != GRPQUOTA)) + return -EINVAL; + + if (fdq->d_fieldmask & ~GFS2_FIELDMASK) + return -EINVAL; + + error = qd_get(sdp, qid, &qd); + if (error) + return error; + + error = gfs2_rs_alloc(ip); + if (error) + goto out_put; + + mutex_lock(&ip->i_inode.i_mutex); + error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh); + if (error) + goto out_unlockput; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + goto out_q; + + /* Check for existing entry, if none then alloc new blocks */ + error = update_qd(sdp, qd); + if (error) + goto out_i; + + /* If nothing has changed, this is a no-op */ + if ((fdq->d_fieldmask & FS_DQ_BSOFT) && + ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) + fdq->d_fieldmask ^= FS_DQ_BSOFT; + + if ((fdq->d_fieldmask & FS_DQ_BHARD) && + ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) + fdq->d_fieldmask ^= FS_DQ_BHARD; + + if ((fdq->d_fieldmask & FS_DQ_BCOUNT) && + ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value))) + fdq->d_fieldmask ^= FS_DQ_BCOUNT; + + if (fdq->d_fieldmask == 0) + goto out_i; + + offset = qd2offset(qd); + alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota)); + if (gfs2_is_stuffed(ip)) + alloc_required = 1; + if (alloc_required) { + struct gfs2_alloc_parms ap = { .aflags = 0, }; + gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), + &data_blocks, &ind_blocks); + blocks = 1 + data_blocks + ind_blocks; + ap.target = blocks; + error = gfs2_inplace_reserve(ip, &ap); + if (error) + goto out_i; + blocks += gfs2_rg_blocks(ip, blocks); } + + /* Some quotas span block boundaries and can update two blocks, + adding an extra block to the transaction to handle such quotas */ + error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 2, 0); + if (error) + goto out_release; + + /* Apply changes */ + error = gfs2_adjust_quota(ip, offset, 0, qd, fdq); + + gfs2_trans_end(sdp); +out_release: + if (alloc_required) + gfs2_inplace_release(ip); +out_i: + gfs2_glock_dq_uninit(&i_gh); +out_q: + gfs2_glock_dq_uninit(&q_gh); +out_unlockput: + mutex_unlock(&ip->i_inode.i_mutex); +out_put: + qd_put(qd); + return error; } +const struct quotactl_ops gfs2_quotactl_ops = { + .quota_sync = gfs2_quota_sync, + .get_xstate = gfs2_quota_get_xstate, + .get_dqblk = gfs2_get_dqblk, + .set_dqblk = gfs2_set_dqblk, +}; + +void __init gfs2_quota_hash_init(void) +{ + unsigned i; + + for(i = 0; i < GFS2_QD_HASH_SIZE; i++) + INIT_HLIST_BL_HEAD(&qd_hash_table[i]); +} diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index a8be1417051..55d506eb3c4 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -10,26 +10,53 @@ #ifndef __QUOTA_DOT_H__ #define __QUOTA_DOT_H__ +#include <linux/list_lru.h> + struct gfs2_inode; struct gfs2_sbd; -#define NO_QUOTA_CHANGE ((u32)-1) - -int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid); -void gfs2_quota_unhold(struct gfs2_inode *ip); - -int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid); -void gfs2_quota_unlock(struct gfs2_inode *ip); - -int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); -void gfs2_quota_change(struct gfs2_inode *ip, s64 change, - u32 uid, u32 gid); - -int gfs2_quota_sync(struct gfs2_sbd *sdp); -int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); - -int gfs2_quota_init(struct gfs2_sbd *sdp); -void gfs2_quota_scan(struct gfs2_sbd *sdp); -void gfs2_quota_cleanup(struct gfs2_sbd *sdp); +#define NO_UID_QUOTA_CHANGE INVALID_UID +#define NO_GID_QUOTA_CHANGE INVALID_GID + +extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); +extern void gfs2_quota_unhold(struct gfs2_inode *ip); + +extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); +extern void gfs2_quota_unlock(struct gfs2_inode *ip); + +extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); +extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, + kuid_t uid, kgid_t gid); + +extern int gfs2_quota_sync(struct super_block *sb, int type); +extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid); + +extern int gfs2_quota_init(struct gfs2_sbd *sdp); +extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp); +extern int gfs2_quotad(void *data); + +extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp); + +static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int ret; + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return 0; + ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); + if (ret) + return ret; + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + return 0; + ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); + if (ret) + gfs2_quota_unlock(ip); + return ret; +} + +extern const struct quotactl_ops gfs2_quotactl_ops; +extern struct shrinker gfs2_qd_shrinker; +extern struct list_lru gfs2_qd_lru; +extern void __init gfs2_quota_hash_init(void); #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 6fb07d67ca8..94555d4c569 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -7,20 +7,19 @@ * of the GNU General Public License version 2. */ +#include <linux/module.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> -#include <linux/lm_interface.h> #include "gfs2.h" #include "incore.h" #include "bmap.h" #include "glock.h" #include "glops.h" -#include "lm.h" #include "lops.h" #include "meta_io.h" #include "recovery.h" @@ -28,6 +27,8 @@ #include "util.h" #include "dir.h" +struct workqueue_struct *gfs_recovery_wq; + int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, struct buffer_head **bh) { @@ -51,9 +52,9 @@ int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, return error; } -int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) +int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) { - struct list_head *head = &sdp->sd_revoke_list; + struct list_head *head = &jd->jd_revoke_list; struct gfs2_revoke_replay *rr; int found = 0; @@ -69,7 +70,7 @@ int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) return 0; } - rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL); + rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_NOFS); if (!rr) return -ENOMEM; @@ -80,13 +81,13 @@ int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) return 1; } -int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) +int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) { struct gfs2_revoke_replay *rr; int wrap, a, b, revoke; int found = 0; - list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) { + list_for_each_entry(rr, &jd->jd_revoke_list, rr_list) { if (rr->rr_blkno == blkno) { found = 1; break; @@ -96,17 +97,17 @@ int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) if (!found) return 0; - wrap = (rr->rr_where < sdp->sd_replay_tail); - a = (sdp->sd_replay_tail < where); + wrap = (rr->rr_where < jd->jd_replay_tail); + a = (jd->jd_replay_tail < where); b = (where < rr->rr_where); revoke = (wrap) ? (a || b) : (a && b); return revoke; } -void gfs2_revoke_clean(struct gfs2_sbd *sdp) +void gfs2_revoke_clean(struct gfs2_jdesc *jd) { - struct list_head *head = &sdp->sd_revoke_list; + struct list_head *head = &jd->jd_revoke_list; struct gfs2_revoke_replay *rr; while (!list_empty(head)) { @@ -150,7 +151,7 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, struct gfs2_log_header_host *head) { struct buffer_head *bh; - struct gfs2_log_header_host lh; + struct gfs2_log_header_host uninitialized_var(lh); const u32 nothing = 0; u32 hash; int error; @@ -410,7 +411,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea memset(lh, 0, sizeof(struct gfs2_log_header)); lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); + lh->lh_header.__pad0 = cpu_to_be64(0); lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); + lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1); lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT); lh->lh_blkno = cpu_to_be32(lblock); @@ -425,31 +428,44 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea return error; } -/** - * gfs2_recover_journal - recovery a given journal - * @jd: the struct gfs2_jdesc describing the journal - * - * Acquire the journal's lock, check to see if the journal is clean, and - * do recovery if necessary. - * - * Returns: errno - */ -int gfs2_recover_journal(struct gfs2_jdesc *jd) +static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, + unsigned int message) { + char env_jid[20]; + char env_status[20]; + char *envp[] = { env_jid, env_status, NULL }; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + ls->ls_recover_jid_done = jid; + ls->ls_recover_jid_status = message; + sprintf(env_jid, "JID=%d", jid); + sprintf(env_status, "RECOVERY=%s", + message == LM_RD_SUCCESS ? "Done" : "Failed"); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); + + if (sdp->sd_lockstruct.ls_ops->lm_recovery_result) + sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message); +} + +void gfs2_recover_func(struct work_struct *work) +{ + struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_log_header_host head; - struct gfs2_holder j_gh, ji_gh, t_gh; + struct gfs2_holder j_gh, ji_gh, thaw_gh; unsigned long t; int ro = 0; unsigned int pass; int error; + int jlocked = 0; - if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { + if (sdp->sd_args.ar_spectator || + (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) { fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", jd->jd_jid); - + jlocked = 1; /* Acquire the journal lock so we can do recovery */ error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, @@ -492,15 +508,17 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd) t = jiffies; - /* Acquire a shared hold on the transaction lock */ + /* Acquire a shared hold on the freeze lock */ - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, - LM_FLAG_NOEXP | LM_FLAG_PRIORITY | - GL_NOCANCEL | GL_NOCACHE, &t_gh); + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | LM_FLAG_PRIORITY, + &thaw_gh); if (error) goto fail_gunlock_ji; - if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { + if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) { + ro = 1; + } else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) ro = 1; } else { @@ -520,7 +538,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd) fs_warn(sdp, "jid=%u: Can't replay: read-only block " "device\n", jd->jd_jid); error = -EROFS; - goto fail_gunlock_tr; + goto fail_gunlock_thaw; } fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid); @@ -531,63 +549,69 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd) head.lh_blkno, pass); lops_after_scan(jd, error, pass); if (error) - goto fail_gunlock_tr; + goto fail_gunlock_thaw; } error = clean_journal(jd, &head); if (error) - goto fail_gunlock_tr; + goto fail_gunlock_thaw; - gfs2_glock_dq_uninit(&t_gh); + gfs2_glock_dq_uninit(&thaw_gh); t = DIV_ROUND_UP(jiffies - t, HZ); fs_info(sdp, "jid=%u: Journal replayed in %lus\n", jd->jd_jid, t); } - if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) - gfs2_glock_dq_uninit(&ji_gh); + gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); - gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); - - if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) + if (jlocked) { + gfs2_glock_dq_uninit(&ji_gh); gfs2_glock_dq_uninit(&j_gh); + } fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); - return 0; + goto done; -fail_gunlock_tr: - gfs2_glock_dq_uninit(&t_gh); +fail_gunlock_thaw: + gfs2_glock_dq_uninit(&thaw_gh); fail_gunlock_ji: - if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { + if (jlocked) { gfs2_glock_dq_uninit(&ji_gh); fail_gunlock_j: gfs2_glock_dq_uninit(&j_gh); } fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); - fail: - gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); - return error; + jd->jd_recover_error = error; + gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); +done: + clear_bit(JDF_RECOVERY, &jd->jd_flags); + smp_mb__after_atomic(); + wake_up_bit(&jd->jd_flags, JDF_RECOVERY); } -/** - * gfs2_check_journals - Recover any dirty journals - * @sdp: the filesystem - * - */ +static int gfs2_recovery_wait(void *word) +{ + schedule(); + return 0; +} -void gfs2_check_journals(struct gfs2_sbd *sdp) +int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) { - struct gfs2_jdesc *jd; + int rv; - for (;;) { - jd = gfs2_jdesc_find_dirty(sdp); - if (!jd) - break; + if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags)) + return -EBUSY; - if (jd != sdp->sd_jdesc) - gfs2_recover_journal(jd); - } + /* we have JDF_RECOVERY, queue should always succeed */ + rv = queue_work(gfs_recovery_wq, &jd->jd_work); + BUG_ON(!rv); + + if (wait) + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, + TASK_UNINTERRUPTIBLE); + + return wait ? jd->jd_recover_error : 0; } diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index f7235e61c72..6142836cce9 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -12,23 +12,25 @@ #include "incore.h" +extern struct workqueue_struct *gfs_recovery_wq; + static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) { if (++*blk == sdp->sd_jdesc->jd_blocks) *blk = 0; } -int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, +extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, struct buffer_head **bh); -int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); -int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); -void gfs2_revoke_clean(struct gfs2_sbd *sdp); +extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); +extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); +extern void gfs2_revoke_clean(struct gfs2_jdesc *jd); -int gfs2_find_jhead(struct gfs2_jdesc *jd, +extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head); -int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); -void gfs2_check_journals(struct gfs2_sbd *sdp); +extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); +extern void gfs2_recover_func(struct work_struct *work); #endif /* __RECOVERY_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 3552110b2e5..f4cb9c0d6bb 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1,19 +1,24 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> +#include <linux/prefetch.h> +#include <linux/blkdev.h> +#include <linux/rbtree.h> +#include <linux/random.h> #include "gfs2.h" #include "incore.h" @@ -28,11 +33,21 @@ #include "util.h" #include "log.h" #include "inode.h" -#include "ops_address.h" +#include "trace_gfs2.h" #define BFITNOENT ((u32)~0) #define NO_BLOCK ((u64)~0) +#if BITS_PER_LONG == 32 +#define LBITMASK (0x55555555UL) +#define LBITSKIP55 (0x55555555UL) +#define LBITSKIP00 (0x00000000UL) +#else +#define LBITMASK (0x5555555555555555UL) +#define LBITSKIP55 (0x5555555555555555UL) +#define LBITSKIP00 (0x0000000000000000UL) +#endif + /* * These routines are used by the resource group routines (rgrp.c) * to keep track of block allocation. Each block is represented by two @@ -44,6 +59,11 @@ * 3 = Used (metadata) */ +struct gfs2_extent { + struct gfs2_rbm rbm; + u32 len; +}; + static const char valid_change[16] = { /* current */ /* n */ 0, 1, 1, 1, @@ -52,126 +72,338 @@ static const char valid_change[16] = { 1, 0, 0, 0 }; -static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, - unsigned char old_state, unsigned char new_state); +static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, + const struct gfs2_inode *ip, bool nowrap, + const struct gfs2_alloc_parms *ap); + /** * gfs2_setbit - Set a bit in the bitmaps - * @buffer: the buffer that holds the bitmaps - * @buflen: the length (in bytes) of the buffer - * @block: the block to set + * @rbm: The position of the bit to set + * @do_clone: Also set the clone bitmap, if it exists * @new_state: the new state of the block * */ -static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer, - unsigned int buflen, u32 block, - unsigned char new_state) +static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone, + unsigned char new_state) { - unsigned char *byte, *end, cur_state; - unsigned int bit; - - byte = buffer + (block / GFS2_NBBY); - bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; - end = buffer + buflen; - - gfs2_assert(rgd->rd_sbd, byte < end); - - cur_state = (*byte >> bit) & GFS2_BIT_MASK; + unsigned char *byte1, *byte2, *end, cur_state; + struct gfs2_bitmap *bi = rbm_bi(rbm); + unsigned int buflen = bi->bi_len; + const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE; + + byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY); + end = bi->bi_bh->b_data + bi->bi_offset + buflen; + + BUG_ON(byte1 >= end); + + cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; + + if (unlikely(!valid_change[new_state * 4 + cur_state])) { + pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n", + rbm->offset, cur_state, new_state); + pr_warn("rgrp=0x%llx bi_start=0x%x\n", + (unsigned long long)rbm->rgd->rd_addr, bi->bi_start); + pr_warn("bi_offset=0x%x bi_len=0x%x\n", + bi->bi_offset, bi->bi_len); + dump_stack(); + gfs2_consist_rgrpd(rbm->rgd); + return; + } + *byte1 ^= (cur_state ^ new_state) << bit; - if (valid_change[new_state * 4 + cur_state]) { - *byte ^= cur_state << bit; - *byte |= new_state << bit; - } else - gfs2_consist_rgrpd(rgd); + if (do_clone && bi->bi_clone) { + byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY); + cur_state = (*byte2 >> bit) & GFS2_BIT_MASK; + *byte2 ^= (cur_state ^ new_state) << bit; + } } /** * gfs2_testbit - test a bit in the bitmaps - * @buffer: the buffer that holds the bitmaps - * @buflen: the length (in bytes) of the buffer - * @block: the block to read + * @rbm: The bit to test * + * Returns: The two bit block state of the requested bit */ -static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer, - unsigned int buflen, u32 block) +static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm) { - unsigned char *byte, *end, cur_state; + struct gfs2_bitmap *bi = rbm_bi(rbm); + const u8 *buffer = bi->bi_bh->b_data + bi->bi_offset; + const u8 *byte; unsigned int bit; - byte = buffer + (block / GFS2_NBBY); - bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; - end = buffer + buflen; + byte = buffer + (rbm->offset / GFS2_NBBY); + bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE; - gfs2_assert(rgd->rd_sbd, byte < end); + return (*byte >> bit) & GFS2_BIT_MASK; +} - cur_state = (*byte >> bit) & GFS2_BIT_MASK; +/** + * gfs2_bit_search + * @ptr: Pointer to bitmap data + * @mask: Mask to use (normally 0x55555.... but adjusted for search start) + * @state: The state we are searching for + * + * We xor the bitmap data with a patter which is the bitwise opposite + * of what we are looking for, this gives rise to a pattern of ones + * wherever there is a match. Since we have two bits per entry, we + * take this pattern, shift it down by one place and then and it with + * the original. All the even bit positions (0,2,4, etc) then represent + * successful matches, so we mask with 0x55555..... to remove the unwanted + * odd bit positions. + * + * This allows searching of a whole u64 at once (32 blocks) with a + * single test (on 64 bit arches). + */ + +static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state) +{ + u64 tmp; + static const u64 search[] = { + [0] = 0xffffffffffffffffULL, + [1] = 0xaaaaaaaaaaaaaaaaULL, + [2] = 0x5555555555555555ULL, + [3] = 0x0000000000000000ULL, + }; + tmp = le64_to_cpu(*ptr) ^ search[state]; + tmp &= (tmp >> 1); + tmp &= mask; + return tmp; +} + +/** + * rs_cmp - multi-block reservation range compare + * @blk: absolute file system block number of the new reservation + * @len: number of blocks in the new reservation + * @rs: existing reservation to compare against + * + * returns: 1 if the block range is beyond the reach of the reservation + * -1 if the block range is before the start of the reservation + * 0 if the block range overlaps with the reservation + */ +static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs) +{ + u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm); - return cur_state; + if (blk >= startblk + rs->rs_free) + return 1; + if (blk + len - 1 < startblk) + return -1; + return 0; } /** * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing * a block in a given allocation state. - * @buffer: the buffer that holds the bitmaps - * @buflen: the length (in bytes) of the buffer + * @buf: the buffer that holds the bitmaps + * @len: the length (in bytes) of the buffer * @goal: start search at this block's bit-pair (within @buffer) - * @old_state: GFS2_BLKST_XXX the state of the block we're looking for. + * @state: GFS2_BLKST_XXX the state of the block we're looking for. * * Scope of @goal and returned block number is only within this bitmap buffer, * not entire rgrp or filesystem. @buffer will be offset from the actual - * beginning of a bitmap block buffer, skipping any header structures. + * beginning of a bitmap block buffer, skipping any header structures, but + * headers are always a multiple of 64 bits long so that the buffer is + * always aligned to a 64 bit boundary. + * + * The size of the buffer is in bytes, but is it assumed that it is + * always ok to read a complete multiple of 64 bits at the end + * of the block in case the end is no aligned to a natural boundary. * * Return: the block number (bitmap buffer scope) that was found */ -static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal, - unsigned char old_state) +static u32 gfs2_bitfit(const u8 *buf, const unsigned int len, + u32 goal, u8 state) { - unsigned char *byte; - u32 blk = goal; - unsigned int bit, bitlong; - unsigned long *plong, plong55; + u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1); + const __le64 *ptr = ((__le64 *)buf) + (goal >> 5); + const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64))); + u64 tmp; + u64 mask = 0x5555555555555555ULL; + u32 bit; + + /* Mask off bits we don't care about at the start of the search */ + mask <<= spoint; + tmp = gfs2_bit_search(ptr, mask, state); + ptr++; + while(tmp == 0 && ptr < end) { + tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state); + ptr++; + } + /* Mask off any bits which are more than len bytes from the start */ + if (ptr == end && (len & (sizeof(u64) - 1))) + tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1)))); + /* Didn't find anything, so return */ + if (tmp == 0) + return BFITNOENT; + ptr--; + bit = __ffs64(tmp); + bit /= 2; /* two bits per entry in the bitmap */ + return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit; +} - byte = buffer + (goal / GFS2_NBBY); - plong = (unsigned long *)(buffer + (goal / GFS2_NBBY)); - bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE; - bitlong = bit; -#if BITS_PER_LONG == 32 - plong55 = 0x55555555; -#else - plong55 = 0x5555555555555555; -#endif - while (byte < buffer + buflen) { +/** + * gfs2_rbm_from_block - Set the rbm based upon rgd and block number + * @rbm: The rbm with rgd already set correctly + * @block: The block number (filesystem relative) + * + * This sets the bi and offset members of an rbm based on a + * resource group and a filesystem relative block number. The + * resource group must be set in the rbm on entry, the bi and + * offset members will be set by this function. + * + * Returns: 0 on success, or an error code + */ - if (bitlong == 0 && old_state == 0 && *plong == plong55) { - plong++; - byte += sizeof(unsigned long); - blk += sizeof(unsigned long) * GFS2_NBBY; - continue; - } - if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) - return blk; - bit += GFS2_BIT_SIZE; - if (bit >= 8) { - bit = 0; - byte++; +static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block) +{ + u64 rblock = block - rbm->rgd->rd_data0; + + if (WARN_ON_ONCE(rblock > UINT_MAX)) + return -EINVAL; + if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data) + return -E2BIG; + + rbm->bii = 0; + rbm->offset = (u32)(rblock); + /* Check if the block is within the first block */ + if (rbm->offset < rbm_bi(rbm)->bi_blocks) + return 0; + + /* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */ + rbm->offset += (sizeof(struct gfs2_rgrp) - + sizeof(struct gfs2_meta_header)) * GFS2_NBBY; + rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap; + rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap; + return 0; +} + +/** + * gfs2_rbm_incr - increment an rbm structure + * @rbm: The rbm with rgd already set correctly + * + * This function takes an existing rbm structure and increments it to the next + * viable block offset. + * + * Returns: If incrementing the offset would cause the rbm to go past the + * end of the rgrp, true is returned, otherwise false. + * + */ + +static bool gfs2_rbm_incr(struct gfs2_rbm *rbm) +{ + if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */ + rbm->offset++; + return false; + } + if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */ + return true; + + rbm->offset = 0; + rbm->bii++; + return false; +} + +/** + * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned + * @rbm: Position to search (value/result) + * @n_unaligned: Number of unaligned blocks to check + * @len: Decremented for each block found (terminate on zero) + * + * Returns: true if a non-free block is encountered + */ + +static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len) +{ + u32 n; + u8 res; + + for (n = 0; n < n_unaligned; n++) { + res = gfs2_testbit(rbm); + if (res != GFS2_BLKST_FREE) + return true; + (*len)--; + if (*len == 0) + return true; + if (gfs2_rbm_incr(rbm)) + return true; + } + + return false; +} + +/** + * gfs2_free_extlen - Return extent length of free blocks + * @rrbm: Starting position + * @len: Max length to check + * + * Starting at the block specified by the rbm, see how many free blocks + * there are, not reading more than len blocks ahead. This can be done + * using memchr_inv when the blocks are byte aligned, but has to be done + * on a block by block basis in case of unaligned blocks. Also this + * function can cope with bitmap boundaries (although it must stop on + * a resource group boundary) + * + * Returns: Number of free blocks in the extent + */ + +static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len) +{ + struct gfs2_rbm rbm = *rrbm; + u32 n_unaligned = rbm.offset & 3; + u32 size = len; + u32 bytes; + u32 chunk_size; + u8 *ptr, *start, *end; + u64 block; + struct gfs2_bitmap *bi; + + if (n_unaligned && + gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len)) + goto out; + + n_unaligned = len & 3; + /* Start is now byte aligned */ + while (len > 3) { + bi = rbm_bi(&rbm); + start = bi->bi_bh->b_data; + if (bi->bi_clone) + start = bi->bi_clone; + end = start + bi->bi_bh->b_size; + start += bi->bi_offset; + BUG_ON(rbm.offset & 3); + start += (rbm.offset / GFS2_NBBY); + bytes = min_t(u32, len / GFS2_NBBY, (end - start)); + ptr = memchr_inv(start, 0, bytes); + chunk_size = ((ptr == NULL) ? bytes : (ptr - start)); + chunk_size *= GFS2_NBBY; + BUG_ON(len < chunk_size); + len -= chunk_size; + block = gfs2_rbm_to_block(&rbm); + if (gfs2_rbm_from_block(&rbm, block + chunk_size)) { + n_unaligned = 0; + break; } - bitlong += GFS2_BIT_SIZE; - if (bitlong >= sizeof(unsigned long) * 8) { - bitlong = 0; - plong++; + if (ptr) { + n_unaligned = 3; + break; } - - blk++; + n_unaligned = len & 3; } - return BFITNOENT; + /* Deal with any bits left over at the end */ + if (n_unaligned) + gfs2_unaligned_extlen(&rbm, n_unaligned, &len); +out: + return size - len; } /** * gfs2_bitcount - count the number of bits in a certain state + * @rgd: the resource group descriptor * @buffer: the buffer that holds the bitmaps * @buflen: the length (in bytes) of the buffer * @state: the state of the block we're looking for @@ -179,14 +411,14 @@ static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal, * Returns: The number of bits */ -static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer, - unsigned int buflen, unsigned char state) +static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer, + unsigned int buflen, u8 state) { - unsigned char *byte = buffer; - unsigned char *end = buffer + buflen; - unsigned char state1 = state << 2; - unsigned char state2 = state << 4; - unsigned char state3 = state << 6; + const u8 *byte = buffer; + const u8 *end = buffer + buflen; + const u8 state1 = state << 2; + const u8 state2 = state << 4; + const u8 state3 = state << 6; u32 count = 0; for (; byte < end; byte++) { @@ -205,7 +437,6 @@ static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer, /** * gfs2_rgrp_verify - Verify that a resource group is consistent - * @sdp: the filesystem * @rgd: the rgrp * */ @@ -230,37 +461,27 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) bi->bi_len, x); } - if (count[0] != rgd->rd_rg.rg_free) { + if (count[0] != rgd->rd_free) { if (gfs2_consist_rgrpd(rgd)) fs_err(sdp, "free data mismatch: %u != %u\n", - count[0], rgd->rd_rg.rg_free); + count[0], rgd->rd_free); return; } - tmp = rgd->rd_data - - rgd->rd_rg.rg_free - - rgd->rd_rg.rg_dinodes; - if (count[1] + count[2] != tmp) { + tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes; + if (count[1] != tmp) { if (gfs2_consist_rgrpd(rgd)) fs_err(sdp, "used data mismatch: %u != %u\n", count[1], tmp); return; } - if (count[3] != rgd->rd_rg.rg_dinodes) { + if (count[2] + count[3] != rgd->rd_dinodes) { if (gfs2_consist_rgrpd(rgd)) fs_err(sdp, "used metadata mismatch: %u != %u\n", - count[3], rgd->rd_rg.rg_dinodes); - return; - } - - if (count[2] > count[3]) { - if (gfs2_consist_rgrpd(rgd)) - fs_err(sdp, "unlinked inodes > inodes: %u\n", - count[2]); + count[2] + count[3], rgd->rd_dinodes); return; } - } static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) @@ -273,25 +494,38 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) /** * gfs2_blk2rgrpd - Find resource group for a given data/meta block number * @sdp: The GFS2 superblock - * @n: The data block number + * @blk: The data block number + * @exact: True if this needs to be an exact match * * Returns: The resource group, or NULL if not found */ -struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) +struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact) { - struct gfs2_rgrpd *rgd; + struct rb_node *n, *next; + struct gfs2_rgrpd *cur; spin_lock(&sdp->sd_rindex_spin); - - list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) { - if (rgrp_contains_block(rgd, blk)) { - list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + n = sdp->sd_rindex_tree.rb_node; + while (n) { + cur = rb_entry(n, struct gfs2_rgrpd, rd_node); + next = NULL; + if (blk < cur->rd_addr) + next = n->rb_left; + else if (blk >= cur->rd_data0 + cur->rd_data) + next = n->rb_right; + if (next == NULL) { spin_unlock(&sdp->sd_rindex_spin); - return rgd; + if (exact) { + if (blk < cur->rd_addr) + return NULL; + if (blk >= cur->rd_data0 + cur->rd_data) + return NULL; + } + return cur; } + n = next; } - spin_unlock(&sdp->sd_rindex_spin); return NULL; @@ -306,71 +540,209 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) { - gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list)); - return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list); + const struct rb_node *n; + struct gfs2_rgrpd *rgd; + + spin_lock(&sdp->sd_rindex_spin); + n = rb_first(&sdp->sd_rindex_tree); + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + spin_unlock(&sdp->sd_rindex_spin); + + return rgd; } /** * gfs2_rgrpd_get_next - get the next RG - * @rgd: A RG + * @rgd: the resource group descriptor * * Returns: The next rgrp */ struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) { - if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list) + struct gfs2_sbd *sdp = rgd->rd_sbd; + const struct rb_node *n; + + spin_lock(&sdp->sd_rindex_spin); + n = rb_next(&rgd->rd_node); + if (n == NULL) + n = rb_first(&sdp->sd_rindex_tree); + + if (unlikely(&rgd->rd_node == n)) { + spin_unlock(&sdp->sd_rindex_spin); return NULL; - return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list); + } + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + spin_unlock(&sdp->sd_rindex_spin); + return rgd; } -static void clear_rgrpdi(struct gfs2_sbd *sdp) +void gfs2_free_clones(struct gfs2_rgrpd *rgd) +{ + int x; + + for (x = 0; x < rgd->rd_length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + kfree(bi->bi_clone); + bi->bi_clone = NULL; + } +} + +/** + * gfs2_rs_alloc - make sure we have a reservation assigned to the inode + * @ip: the inode for this reservation + */ +int gfs2_rs_alloc(struct gfs2_inode *ip) +{ + int error = 0; + + down_write(&ip->i_rw_mutex); + if (ip->i_res) + goto out; + + ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); + if (!ip->i_res) { + error = -ENOMEM; + goto out; + } + + RB_CLEAR_NODE(&ip->i_res->rs_node); +out: + up_write(&ip->i_rw_mutex); + return error; +} + +static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) +{ + gfs2_print_dbg(seq, " B: n:%llu s:%llu b:%u f:%u\n", + (unsigned long long)rs->rs_inum, + (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm), + rs->rs_rbm.offset, rs->rs_free); +} + +/** + * __rs_deltree - remove a multi-block reservation from the rgd tree + * @rs: The reservation to remove + * + */ +static void __rs_deltree(struct gfs2_blkreserv *rs) { - struct list_head *head; struct gfs2_rgrpd *rgd; - struct gfs2_glock *gl; - spin_lock(&sdp->sd_rindex_spin); - sdp->sd_rindex_forward = NULL; - head = &sdp->sd_rindex_recent_list; - while (!list_empty(head)) { - rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent); - list_del(&rgd->rd_recent); + if (!gfs2_rs_active(rs)) + return; + + rgd = rs->rs_rbm.rgd; + trace_gfs2_rs(rs, TRACE_RS_TREEDEL); + rb_erase(&rs->rs_node, &rgd->rd_rstree); + RB_CLEAR_NODE(&rs->rs_node); + + if (rs->rs_free) { + struct gfs2_bitmap *bi = rbm_bi(&rs->rs_rbm); + + /* return reserved blocks to the rgrp */ + BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); + rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; + /* The rgrp extent failure point is likely not to increase; + it will only do so if the freed blocks are somehow + contiguous with a span of free blocks that follows. Still, + it will force the number to be recalculated later. */ + rgd->rd_extfail_pt += rs->rs_free; + rs->rs_free = 0; + clear_bit(GBF_FULL, &bi->bi_flags); } - spin_unlock(&sdp->sd_rindex_spin); +} - head = &sdp->sd_rindex_list; - while (!list_empty(head)) { - rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list); +/** + * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree + * @rs: The reservation to remove + * + */ +void gfs2_rs_deltree(struct gfs2_blkreserv *rs) +{ + struct gfs2_rgrpd *rgd; + + rgd = rs->rs_rbm.rgd; + if (rgd) { + spin_lock(&rgd->rd_rsspin); + __rs_deltree(rs); + spin_unlock(&rgd->rd_rsspin); + } +} + +/** + * gfs2_rs_delete - delete a multi-block reservation + * @ip: The inode for this reservation + * @wcount: The inode's write count, or NULL + * + */ +void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount) +{ + down_write(&ip->i_rw_mutex); + if (ip->i_res && ((wcount == NULL) || (atomic_read(wcount) <= 1))) { + gfs2_rs_deltree(ip->i_res); + BUG_ON(ip->i_res->rs_free); + kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); + ip->i_res = NULL; + } + up_write(&ip->i_rw_mutex); +} + +/** + * return_all_reservations - return all reserved blocks back to the rgrp. + * @rgd: the rgrp that needs its space back + * + * We previously reserved a bunch of blocks for allocation. Now we need to + * give them back. This leave the reservation structures in tact, but removes + * all of their corresponding "no-fly zones". + */ +static void return_all_reservations(struct gfs2_rgrpd *rgd) +{ + struct rb_node *n; + struct gfs2_blkreserv *rs; + + spin_lock(&rgd->rd_rsspin); + while ((n = rb_first(&rgd->rd_rstree))) { + rs = rb_entry(n, struct gfs2_blkreserv, rs_node); + __rs_deltree(rs); + } + spin_unlock(&rgd->rd_rsspin); +} + +void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) +{ + struct rb_node *n; + struct gfs2_rgrpd *rgd; + struct gfs2_glock *gl; + + while ((n = rb_first(&sdp->sd_rindex_tree))) { + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); gl = rgd->rd_gl; - list_del(&rgd->rd_list); - list_del(&rgd->rd_list_mru); + rb_erase(n, &sdp->sd_rindex_tree); if (gl) { + spin_lock(&gl->gl_spin); gl->gl_object = NULL; + spin_unlock(&gl->gl_spin); + gfs2_glock_add_to_lru(gl); gfs2_glock_put(gl); } + gfs2_free_clones(rgd); kfree(rgd->rd_bits); - kfree(rgd); + return_all_reservations(rgd); + kmem_cache_free(gfs2_rgrpd_cachep, rgd); } } -void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) -{ - mutex_lock(&sdp->sd_rindex_mutex); - clear_rgrpdi(sdp); - mutex_unlock(&sdp->sd_rindex_mutex); -} - static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) { - printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); - printk(KERN_INFO " ri_length = %u\n", rgd->rd_length); - printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0); - printk(KERN_INFO " ri_data = %u\n", rgd->rd_data); - printk(KERN_INFO " ri_bitbytes = %u\n", rgd->rd_bitbytes); + pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); + pr_info("ri_length = %u\n", rgd->rd_length); + pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0); + pr_info("ri_data = %u\n", rgd->rd_data); + pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes); } /** @@ -402,24 +774,28 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; + bi->bi_flags = 0; /* small rgrp; bitmap stored completely in header block */ if (length == 1) { bytes = bytes_left; bi->bi_offset = sizeof(struct gfs2_rgrp); bi->bi_start = 0; bi->bi_len = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; /* header block */ } else if (x == 0) { bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp); bi->bi_offset = sizeof(struct gfs2_rgrp); bi->bi_start = 0; bi->bi_len = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; /* last block */ } else if (x + 1 == length) { bytes = bytes_left; bi->bi_offset = sizeof(struct gfs2_meta_header); bi->bi_start = rgd->rd_bitbytes - bytes_left; bi->bi_len = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; /* other blocks */ } else { bytes = sdp->sd_sb.sb_bsize - @@ -427,6 +803,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) bi->bi_offset = sizeof(struct gfs2_meta_header); bi->bi_start = rgd->rd_bitbytes - bytes_left; bi->bi_len = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; } bytes_left -= bytes; @@ -451,6 +828,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) /** * gfs2_ri_total - Total up the file system space, according to the rindex. + * @sdp: the filesystem * */ u64 gfs2_ri_total(struct gfs2_sbd *sdp) @@ -459,88 +837,113 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) struct inode *inode = sdp->sd_rindex; struct gfs2_inode *ip = GFS2_I(inode); char buf[sizeof(struct gfs2_rindex)]; - struct file_ra_state ra_state; int error, rgrps; - mutex_lock(&sdp->sd_rindex_mutex); - file_ra_state_init(&ra_state, inode->i_mapping); for (rgrps = 0;; rgrps++) { loff_t pos = rgrps * sizeof(struct gfs2_rindex); - if (pos + sizeof(struct gfs2_rindex) >= ip->i_di.di_size) + if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode)) break; - error = gfs2_internal_read(ip, &ra_state, buf, &pos, + error = gfs2_internal_read(ip, buf, &pos, sizeof(struct gfs2_rindex)); if (error != sizeof(struct gfs2_rindex)) break; total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data); } - mutex_unlock(&sdp->sd_rindex_mutex); return total_data; } -static void gfs2_rindex_in(struct gfs2_rgrpd *rgd, const void *buf) +static int rgd_insert(struct gfs2_rgrpd *rgd) { - const struct gfs2_rindex *str = buf; + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL; + + /* Figure out where to put new node */ + while (*newn) { + struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd, + rd_node); + + parent = *newn; + if (rgd->rd_addr < cur->rd_addr) + newn = &((*newn)->rb_left); + else if (rgd->rd_addr > cur->rd_addr) + newn = &((*newn)->rb_right); + else + return -EEXIST; + } - rgd->rd_addr = be64_to_cpu(str->ri_addr); - rgd->rd_length = be32_to_cpu(str->ri_length); - rgd->rd_data0 = be64_to_cpu(str->ri_data0); - rgd->rd_data = be32_to_cpu(str->ri_data); - rgd->rd_bitbytes = be32_to_cpu(str->ri_bitbytes); + rb_link_node(&rgd->rd_node, parent, newn); + rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree); + sdp->sd_rgrps++; + return 0; } /** * read_rindex_entry - Pull in a new resource index entry from the disk - * @gl: The glock covering the rindex inode + * @ip: Pointer to the rindex inode * - * Returns: 0 on success, error code otherwise + * Returns: 0 on success, > 0 on EOF, error code otherwise */ -static int read_rindex_entry(struct gfs2_inode *ip, - struct file_ra_state *ra_state) +static int read_rindex_entry(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + const unsigned bsize = sdp->sd_sb.sb_bsize; loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); - char buf[sizeof(struct gfs2_rindex)]; + struct gfs2_rindex buf; int error; struct gfs2_rgrpd *rgd; - error = gfs2_internal_read(ip, ra_state, buf, &pos, + if (pos >= i_size_read(&ip->i_inode)) + return 1; + + error = gfs2_internal_read(ip, (char *)&buf, &pos, sizeof(struct gfs2_rindex)); - if (!error) - return 0; - if (error != sizeof(struct gfs2_rindex)) { - if (error > 0) - error = -EIO; - return error; - } - rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS); + if (error != sizeof(struct gfs2_rindex)) + return (error == 0) ? 1 : error; + + rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS); error = -ENOMEM; if (!rgd) return error; - mutex_init(&rgd->rd_mutex); - lops_init_le(&rgd->rd_le, &gfs2_rg_lops); rgd->rd_sbd = sdp; + rgd->rd_addr = be64_to_cpu(buf.ri_addr); + rgd->rd_length = be32_to_cpu(buf.ri_length); + rgd->rd_data0 = be64_to_cpu(buf.ri_data0); + rgd->rd_data = be32_to_cpu(buf.ri_data); + rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); + spin_lock_init(&rgd->rd_rsspin); - list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list); - list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); - - gfs2_rindex_in(rgd, buf); error = compute_bitstructs(rgd); if (error) - return error; + goto fail; error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); if (error) - return error; + goto fail; rgd->rd_gl->gl_object = rgd; - rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1; - rgd->rd_flags |= GFS2_RDF_CHECK; + rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize; + rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1; + rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; + rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + if (rgd->rd_data > sdp->sd_max_rg_data) + sdp->sd_max_rg_data = rgd->rd_data; + spin_lock(&sdp->sd_rindex_spin); + error = rgd_insert(rgd); + spin_unlock(&sdp->sd_rindex_spin); + if (!error) + return 0; + + error = 0; /* someone else read in the rgrp; free it and ignore it */ + gfs2_glock_put(rgd->rd_gl); + +fail: + kfree(rgd->rd_bits); + kmem_cache_free(gfs2_rgrpd_cachep, rgd); return error; } @@ -554,69 +957,22 @@ static int read_rindex_entry(struct gfs2_inode *ip, static int gfs2_ri_update(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct inode *inode = &ip->i_inode; - struct file_ra_state ra_state; - u64 rgrp_count = ip->i_di.di_size; int error; - if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) { - gfs2_consist_inode(ip); - return -EIO; - } - - clear_rgrpdi(sdp); - - file_ra_state_init(&ra_state, inode->i_mapping); - for (sdp->sd_rgrps = 0; sdp->sd_rgrps < rgrp_count; sdp->sd_rgrps++) { - error = read_rindex_entry(ip, &ra_state); - if (error) { - clear_rgrpdi(sdp); - return error; - } - } - - sdp->sd_rindex_vn = ip->i_gl->gl_vn; - return 0; -} - -/** - * gfs2_ri_update_special - Pull in a new resource index from the disk - * - * This is a special version that's safe to call from gfs2_inplace_reserve_i. - * In this case we know that we don't have any resource groups in memory yet. - * - * @ip: pointer to the rindex inode - * - * Returns: 0 on successful update, error code otherwise - */ -static int gfs2_ri_update_special(struct gfs2_inode *ip) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct inode *inode = &ip->i_inode; - struct file_ra_state ra_state; - int error; + do { + error = read_rindex_entry(ip); + } while (error == 0); - file_ra_state_init(&ra_state, inode->i_mapping); - for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { - /* Ignore partials */ - if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > - ip->i_di.di_size) - break; - error = read_rindex_entry(ip, &ra_state); - if (error) { - clear_rgrpdi(sdp); - return error; - } - } + if (error < 0) + return error; - sdp->sd_rindex_vn = ip->i_gl->gl_vn; + sdp->sd_rindex_uptodate = 1; return 0; } /** - * gfs2_rindex_hold - Grab a lock on the rindex + * gfs2_rindex_update - Update the rindex if required * @sdp: The GFS2 superblock - * @ri_gh: the glock holder * * We grab a lock on the rindex inode to make sure that it doesn't * change whilst we are performing an operation. We keep this lock @@ -628,55 +984,116 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip) * special file, which might have been updated if someone expanded the * filesystem (via gfs2_grow utility), which adds new resource groups. * - * Returns: 0 on success, error code otherwise + * Returns: 0 on succeess, error code otherwise */ -int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) +int gfs2_rindex_update(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); struct gfs2_glock *gl = ip->i_gl; - int error; - - error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh); - if (error) - return error; + struct gfs2_holder ri_gh; + int error = 0; + int unlock_required = 0; /* Read new copy from disk if we don't have the latest */ - if (sdp->sd_rindex_vn != gl->gl_vn) { - mutex_lock(&sdp->sd_rindex_mutex); - if (sdp->sd_rindex_vn != gl->gl_vn) { - error = gfs2_ri_update(ip); + if (!sdp->sd_rindex_uptodate) { + if (!gfs2_glock_is_locked_by_me(gl)) { + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh); if (error) - gfs2_glock_dq_uninit(ri_gh); + return error; + unlock_required = 1; } - mutex_unlock(&sdp->sd_rindex_mutex); + if (!sdp->sd_rindex_uptodate) + error = gfs2_ri_update(ip); + if (unlock_required) + gfs2_glock_dq_uninit(&ri_gh); } return error; } -static void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf) +static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) { const struct gfs2_rgrp *str = buf; - - rg->rg_flags = be32_to_cpu(str->rg_flags); - rg->rg_free = be32_to_cpu(str->rg_free); - rg->rg_dinodes = be32_to_cpu(str->rg_dinodes); - rg->rg_igeneration = be64_to_cpu(str->rg_igeneration); + u32 rg_flags; + + rg_flags = be32_to_cpu(str->rg_flags); + rg_flags &= ~GFS2_RDF_MASK; + rgd->rd_flags &= GFS2_RDF_MASK; + rgd->rd_flags |= rg_flags; + rgd->rd_free = be32_to_cpu(str->rg_free); + rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); + rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); } -static void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf) +static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) { struct gfs2_rgrp *str = buf; - str->rg_flags = cpu_to_be32(rg->rg_flags); - str->rg_free = cpu_to_be32(rg->rg_free); - str->rg_dinodes = cpu_to_be32(rg->rg_dinodes); + str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK); + str->rg_free = cpu_to_be32(rgd->rd_free); + str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes); str->__pad = cpu_to_be32(0); - str->rg_igeneration = cpu_to_be64(rg->rg_igeneration); + str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration); memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); } +static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd) +{ + struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl; + struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data; + + if (rgl->rl_flags != str->rg_flags || rgl->rl_free != str->rg_free || + rgl->rl_dinodes != str->rg_dinodes || + rgl->rl_igeneration != str->rg_igeneration) + return 0; + return 1; +} + +static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf) +{ + const struct gfs2_rgrp *str = buf; + + rgl->rl_magic = cpu_to_be32(GFS2_MAGIC); + rgl->rl_flags = str->rg_flags; + rgl->rl_free = str->rg_free; + rgl->rl_dinodes = str->rg_dinodes; + rgl->rl_igeneration = str->rg_igeneration; + rgl->__pad = 0UL; +} + +static void update_rgrp_lvb_unlinked(struct gfs2_rgrpd *rgd, u32 change) +{ + struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl; + u32 unlinked = be32_to_cpu(rgl->rl_unlinked) + change; + rgl->rl_unlinked = cpu_to_be32(unlinked); +} + +static u32 count_unlinked(struct gfs2_rgrpd *rgd) +{ + struct gfs2_bitmap *bi; + const u32 length = rgd->rd_length; + const u8 *buffer = NULL; + u32 i, goal, count = 0; + + for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) { + goal = 0; + buffer = bi->bi_bh->b_data + bi->bi_offset; + WARN_ON(!buffer_uptodate(bi->bi_bh)); + while (goal < bi->bi_len * GFS2_NBBY) { + goal = gfs2_bitfit(buffer, bi->bi_len, goal, + GFS2_BLKST_UNLINKED); + if (goal == BFITNOENT) + break; + count++; + goal++; + } + } + + return count; +} + + /** * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps * @rgd: the struct gfs2_rgrpd describing the RG to read in @@ -687,7 +1104,7 @@ static void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf) * Returns: errno */ -int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) +static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) { struct gfs2_sbd *sdp = rgd->rd_sbd; struct gfs2_glock *gl = rgd->rd_gl; @@ -696,16 +1113,8 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) unsigned int x, y; int error; - mutex_lock(&rgd->rd_mutex); - - spin_lock(&sdp->sd_rindex_spin); - if (rgd->rd_bh_count) { - rgd->rd_bh_count++; - spin_unlock(&sdp->sd_rindex_spin); - mutex_unlock(&rgd->rd_mutex); + if (rgd->rd_bits[0].bi_bh != NULL) return 0; - } - spin_unlock(&sdp->sd_rindex_spin); for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; @@ -726,18 +1135,29 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) } } - if (rgd->rd_rg_vn != gl->gl_vn) { - gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data); - rgd->rd_rg_vn = gl->gl_vn; + if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) { + for (x = 0; x < length; x++) + clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); + gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); + rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); + rgd->rd_free_clone = rgd->rd_free; + /* max out the rgrp allocation failure point */ + rgd->rd_extfail_pt = rgd->rd_free; + } + if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { + rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, + rgd->rd_bits[0].bi_bh->b_data); + } + else if (sdp->sd_args.ar_rgrplvb) { + if (!gfs2_rgrp_lvb_valid(rgd)){ + gfs2_consist_rgrpd(rgd); + error = -EIO; + goto fail; + } + if (rgd->rd_rgl->rl_unlinked == 0) + rgd->rd_flags &= ~GFS2_RDF_CHECK; } - - spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone = rgd->rd_rg.rg_free; - rgd->rd_bh_count++; - spin_unlock(&sdp->sd_rindex_spin); - - mutex_unlock(&rgd->rd_mutex); - return 0; fail: @@ -747,467 +1167,844 @@ fail: bi->bi_bh = NULL; gfs2_assert_warn(sdp, !bi->bi_clone); } - mutex_unlock(&rgd->rd_mutex); return error; } -void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd) +static int update_rgrp_lvb(struct gfs2_rgrpd *rgd) { + u32 rl_flags; + + if (rgd->rd_flags & GFS2_RDF_UPTODATE) + return 0; + + if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) + return gfs2_rgrp_bh_get(rgd); + + rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags); + rl_flags &= ~GFS2_RDF_MASK; + rgd->rd_flags &= GFS2_RDF_MASK; + rgd->rd_flags |= (rl_flags | GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); + if (rgd->rd_rgl->rl_unlinked == 0) + rgd->rd_flags &= ~GFS2_RDF_CHECK; + rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free); + rgd->rd_free_clone = rgd->rd_free; + rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes); + rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration); + return 0; +} + +int gfs2_rgrp_go_lock(struct gfs2_holder *gh) +{ + struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; struct gfs2_sbd *sdp = rgd->rd_sbd; - spin_lock(&sdp->sd_rindex_spin); - gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); - rgd->rd_bh_count++; - spin_unlock(&sdp->sd_rindex_spin); + if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb) + return 0; + return gfs2_rgrp_bh_get(rgd); } /** - * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get() - * @rgd: the struct gfs2_rgrpd describing the RG to read in + * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get() + * @gh: The glock holder for the resource group * */ -void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd) +void gfs2_rgrp_go_unlock(struct gfs2_holder *gh) { - struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; int x, length = rgd->rd_length; - spin_lock(&sdp->sd_rindex_spin); - gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); - if (--rgd->rd_bh_count) { - spin_unlock(&sdp->sd_rindex_spin); - return; - } - for (x = 0; x < length; x++) { struct gfs2_bitmap *bi = rgd->rd_bits + x; - kfree(bi->bi_clone); - bi->bi_clone = NULL; - brelse(bi->bi_bh); - bi->bi_bh = NULL; + if (bi->bi_bh) { + brelse(bi->bi_bh); + bi->bi_bh = NULL; + } } - spin_unlock(&sdp->sd_rindex_spin); } -void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) +int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + struct buffer_head *bh, + const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed) { - struct gfs2_sbd *sdp = rgd->rd_sbd; - unsigned int length = rgd->rd_length; + struct super_block *sb = sdp->sd_vfs; + u64 blk; + sector_t start = 0; + sector_t nr_blks = 0; + int rv; unsigned int x; - - for (x = 0; x < length; x++) { - struct gfs2_bitmap *bi = rgd->rd_bits + x; - if (!bi->bi_clone) + u32 trimmed = 0; + u8 diff; + + for (x = 0; x < bi->bi_len; x++) { + const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data; + clone += bi->bi_offset; + clone += x; + if (bh) { + const u8 *orig = bh->b_data + bi->bi_offset + x; + diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); + } else { + diff = ~(*clone | (*clone >> 1)); + } + diff &= 0x55; + if (diff == 0) continue; - memcpy(bi->bi_clone + bi->bi_offset, - bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); + blk = offset + ((bi->bi_start + x) * GFS2_NBBY); + while(diff) { + if (diff & 1) { + if (nr_blks == 0) + goto start_new_extent; + if ((start + nr_blks) != blk) { + if (nr_blks >= minlen) { + rv = sb_issue_discard(sb, + start, nr_blks, + GFP_NOFS, 0); + if (rv) + goto fail; + trimmed += nr_blks; + } + nr_blks = 0; +start_new_extent: + start = blk; + } + nr_blks++; + } + diff >>= 2; + blk++; + } } + if (nr_blks >= minlen) { + rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0); + if (rv) + goto fail; + trimmed += nr_blks; + } + if (ptrimmed) + *ptrimmed = trimmed; + return 0; - spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone = rgd->rd_rg.rg_free; - spin_unlock(&sdp->sd_rindex_spin); +fail: + if (sdp->sd_args.ar_discard) + fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv); + sdp->sd_args.ar_discard = 0; + return -EIO; } /** - * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode - * @ip: the incore GFS2 inode structure + * gfs2_fitrim - Generate discard requests for unused bits of the filesystem + * @filp: Any file on the filesystem + * @argp: Pointer to the arguments (also used to pass result) * - * Returns: the struct gfs2_alloc + * Returns: 0 on success, otherwise error code */ -struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) +int gfs2_fitrim(struct file *filp, void __user *argp) { - BUG_ON(ip->i_alloc != NULL); - ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_KERNEL); - return ip->i_alloc; -} + struct inode *inode = file_inode(filp); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev); + struct buffer_head *bh; + struct gfs2_rgrpd *rgd; + struct gfs2_rgrpd *rgd_end; + struct gfs2_holder gh; + struct fstrim_range r; + int ret = 0; + u64 amt; + u64 trimmed = 0; + u64 start, end, minlen; + unsigned int x; + unsigned bs_shift = sdp->sd_sb.sb_bsize_shift; -/** - * try_rgrp_fit - See if a given reservation will fit in a given RG - * @rgd: the RG data - * @al: the struct gfs2_alloc structure describing the reservation - * - * If there's room for the requested blocks to be allocated from the RG: - * Sets the $al_rgd field in @al. - * - * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) - */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; -static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) -{ - struct gfs2_sbd *sdp = rgd->rd_sbd; - int ret = 0; + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; - if (rgd->rd_rg.rg_flags & GFS2_RGF_NOALLOC) - return 0; + if (copy_from_user(&r, argp, sizeof(r))) + return -EFAULT; - spin_lock(&sdp->sd_rindex_spin); - if (rgd->rd_free_clone >= al->al_requested) { - al->al_rgd = rgd; - ret = 1; + ret = gfs2_rindex_update(sdp); + if (ret) + return ret; + + start = r.start >> bs_shift; + end = start + (r.len >> bs_shift); + minlen = max_t(u64, r.minlen, + q->limits.discard_granularity) >> bs_shift; + + if (end <= start || minlen > sdp->sd_max_rg_data) + return -EINVAL; + + rgd = gfs2_blk2rgrpd(sdp, start, 0); + rgd_end = gfs2_blk2rgrpd(sdp, end, 0); + + if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end)) + && (start > rgd_end->rd_data0 + rgd_end->rd_data)) + return -EINVAL; /* start is beyond the end of the fs */ + + while (1) { + + ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + goto out; + + if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) { + /* Trim each bitmap in the rgrp */ + for (x = 0; x < rgd->rd_length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + ret = gfs2_rgrp_send_discards(sdp, + rgd->rd_data0, NULL, bi, minlen, + &amt); + if (ret) { + gfs2_glock_dq_uninit(&gh); + goto out; + } + trimmed += amt; + } + + /* Mark rgrp as having been trimmed */ + ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0); + if (ret == 0) { + bh = rgd->rd_bits[0].bi_bh; + rgd->rd_flags |= GFS2_RGF_TRIMMED; + gfs2_trans_add_meta(rgd->rd_gl, bh); + gfs2_rgrp_out(rgd, bh->b_data); + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data); + gfs2_trans_end(sdp); + } + } + gfs2_glock_dq_uninit(&gh); + + if (rgd == rgd_end) + break; + + rgd = gfs2_rgrpd_get_next(rgd); } - spin_unlock(&sdp->sd_rindex_spin); + +out: + r.len = trimmed << bs_shift; + if (copy_to_user(argp, &r, sizeof(r))) + return -EFAULT; return ret; } /** - * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes - * @rgd: The rgrp + * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree + * @ip: the inode structure * - * Returns: The inode, if one has been found */ - -static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) +static void rs_insert(struct gfs2_inode *ip) { - struct inode *inode; - u32 goal = 0, block; - u64 no_addr; - struct gfs2_sbd *sdp = rgd->rd_sbd; - - for(;;) { - if (goal >= rgd->rd_data) - break; - down_write(&sdp->sd_log_flush_lock); - block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, - GFS2_BLKST_UNLINKED); - up_write(&sdp->sd_log_flush_lock); - if (block == BFITNOENT) - break; - /* rgblk_search can return a block < goal, so we need to - keep it marching forward. */ - no_addr = block + rgd->rd_data0; - goal++; - if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked) - continue; - *last_unlinked = no_addr; - inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, - no_addr, -1, 1); - if (!IS_ERR(inode)) - return inode; + struct rb_node **newn, *parent = NULL; + int rc; + struct gfs2_blkreserv *rs = ip->i_res; + struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd; + u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm); + + BUG_ON(gfs2_rs_active(rs)); + + spin_lock(&rgd->rd_rsspin); + newn = &rgd->rd_rstree.rb_node; + while (*newn) { + struct gfs2_blkreserv *cur = + rb_entry(*newn, struct gfs2_blkreserv, rs_node); + + parent = *newn; + rc = rs_cmp(fsblock, rs->rs_free, cur); + if (rc > 0) + newn = &((*newn)->rb_right); + else if (rc < 0) + newn = &((*newn)->rb_left); + else { + spin_unlock(&rgd->rd_rsspin); + WARN_ON(1); + return; + } } - rgd->rd_flags &= ~GFS2_RDF_CHECK; - return NULL; + rb_link_node(&rs->rs_node, parent, newn); + rb_insert_color(&rs->rs_node, &rgd->rd_rstree); + + /* Do our rgrp accounting for the reservation */ + rgd->rd_reserved += rs->rs_free; /* blocks reserved */ + spin_unlock(&rgd->rd_rsspin); + trace_gfs2_rs(rs, TRACE_RS_INSERT); } /** - * recent_rgrp_first - get first RG from "recent" list - * @sdp: The GFS2 superblock - * @rglast: address of the rgrp used last + * rg_mblk_search - find a group of multiple free blocks to form a reservation + * @rgd: the resource group descriptor + * @ip: pointer to the inode for which we're reserving blocks + * @ap: the allocation parameters * - * Returns: The first rgrp in the recent list */ -static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp, - u64 rglast) +static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, + const struct gfs2_alloc_parms *ap) { - struct gfs2_rgrpd *rgd = NULL; + struct gfs2_rbm rbm = { .rgd = rgd, }; + u64 goal; + struct gfs2_blkreserv *rs = ip->i_res; + u32 extlen; + u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved; + int ret; + struct inode *inode = &ip->i_inode; - spin_lock(&sdp->sd_rindex_spin); + if (S_ISDIR(inode->i_mode)) + extlen = 1; + else { + extlen = max_t(u32, atomic_read(&rs->rs_sizehint), ap->target); + extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks); + } + if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen)) + return; - if (list_empty(&sdp->sd_rindex_recent_list)) - goto out; + /* Find bitmap block that contains bits for goal block */ + if (rgrp_contains_block(rgd, ip->i_goal)) + goal = ip->i_goal; + else + goal = rgd->rd_last_alloc + rgd->rd_data0; - if (!rglast) - goto first; + if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) + return; - list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { - if (rgd->rd_addr == rglast) - goto out; + ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap); + if (ret == 0) { + rs->rs_rbm = rbm; + rs->rs_free = extlen; + rs->rs_inum = ip->i_no_addr; + rs_insert(ip); + } else { + if (goal == rgd->rd_last_alloc + rgd->rd_data0) + rgd->rd_last_alloc = 0; } - -first: - rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd, - rd_recent); -out: - spin_unlock(&sdp->sd_rindex_spin); - return rgd; } /** - * recent_rgrp_next - get next RG from "recent" list - * @cur_rgd: current rgrp - * @remove: + * gfs2_next_unreserved_block - Return next block that is not reserved + * @rgd: The resource group + * @block: The starting block + * @length: The required length + * @ip: Ignore any reservations for this inode * - * Returns: The next rgrp in the recent list + * If the block does not appear in any reservation, then return the + * block number unchanged. If it does appear in the reservation, then + * keep looking through the tree of reservations in order to find the + * first block number which is not reserved. */ -static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd, - int remove) +static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, + u32 length, + const struct gfs2_inode *ip) { - struct gfs2_sbd *sdp = cur_rgd->rd_sbd; - struct list_head *head; - struct gfs2_rgrpd *rgd; + struct gfs2_blkreserv *rs; + struct rb_node *n; + int rc; + + spin_lock(&rgd->rd_rsspin); + n = rgd->rd_rstree.rb_node; + while (n) { + rs = rb_entry(n, struct gfs2_blkreserv, rs_node); + rc = rs_cmp(block, length, rs); + if (rc < 0) + n = n->rb_left; + else if (rc > 0) + n = n->rb_right; + else + break; + } - spin_lock(&sdp->sd_rindex_spin); + if (n) { + while ((rs_cmp(block, length, rs) == 0) && (ip->i_res != rs)) { + block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free; + n = n->rb_right; + if (n == NULL) + break; + rs = rb_entry(n, struct gfs2_blkreserv, rs_node); + } + } - head = &sdp->sd_rindex_recent_list; + spin_unlock(&rgd->rd_rsspin); + return block; +} - list_for_each_entry(rgd, head, rd_recent) { - if (rgd == cur_rgd) { - if (cur_rgd->rd_recent.next != head) - rgd = list_entry(cur_rgd->rd_recent.next, - struct gfs2_rgrpd, rd_recent); - else - rgd = NULL; +/** + * gfs2_reservation_check_and_update - Check for reservations during block alloc + * @rbm: The current position in the resource group + * @ip: The inode for which we are searching for blocks + * @minext: The minimum extent length + * @maxext: A pointer to the maximum extent structure + * + * This checks the current position in the rgrp to see whether there is + * a reservation covering this block. If not then this function is a + * no-op. If there is, then the position is moved to the end of the + * contiguous reservation(s) so that we are pointing at the first + * non-reserved block. + * + * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error + */ - if (remove) - list_del(&cur_rgd->rd_recent); +static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, + const struct gfs2_inode *ip, + u32 minext, + struct gfs2_extent *maxext) +{ + u64 block = gfs2_rbm_to_block(rbm); + u32 extlen = 1; + u64 nblock; + int ret; + + /* + * If we have a minimum extent length, then skip over any extent + * which is less than the min extent length in size. + */ + if (minext) { + extlen = gfs2_free_extlen(rbm, minext); + if (extlen <= maxext->len) + goto fail; + } - goto out; + /* + * Check the extent which has been found against the reservations + * and skip if parts of it are already reserved + */ + nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip); + if (nblock == block) { + if (!minext || extlen >= minext) + return 0; + + if (extlen > maxext->len) { + maxext->len = extlen; + maxext->rbm = *rbm; } +fail: + nblock = block + extlen; } - - rgd = NULL; - if (!list_empty(head)) - rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent); - -out: - spin_unlock(&sdp->sd_rindex_spin); - return rgd; + ret = gfs2_rbm_from_block(rbm, nblock); + if (ret < 0) + return ret; + return 1; } /** - * recent_rgrp_add - add an RG to tail of "recent" list - * @new_rgd: The rgrp to add + * gfs2_rbm_find - Look for blocks of a particular state + * @rbm: Value/result starting position and final position + * @state: The state which we want to find + * @minext: Pointer to the requested extent length (NULL for a single block) + * This is updated to be the actual reservation size. + * @ip: If set, check for reservations + * @nowrap: Stop looking at the end of the rgrp, rather than wrapping + * around until we've reached the starting point. + * @ap: the allocation parameters * + * Side effects: + * - If looking for free blocks, we set GBF_FULL on each bitmap which + * has no free blocks in it. + * - If looking for free blocks, we set rd_extfail_pt on each rgrp which + * has come up short on a free block search. + * + * Returns: 0 on success, -ENOSPC if there is no block of the requested state */ -static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd) +static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, + const struct gfs2_inode *ip, bool nowrap, + const struct gfs2_alloc_parms *ap) { - struct gfs2_sbd *sdp = new_rgd->rd_sbd; - struct gfs2_rgrpd *rgd; - unsigned int count = 0; - unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp); + struct buffer_head *bh; + int initial_bii; + u32 initial_offset; + int first_bii = rbm->bii; + u32 first_offset = rbm->offset; + u32 offset; + u8 *buffer; + int n = 0; + int iters = rbm->rgd->rd_length; + int ret; + struct gfs2_bitmap *bi; + struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, }; + + /* If we are not starting at the beginning of a bitmap, then we + * need to add one to the bitmap count to ensure that we search + * the starting bitmap twice. + */ + if (rbm->offset != 0) + iters++; + + while(1) { + bi = rbm_bi(rbm); + if (test_bit(GBF_FULL, &bi->bi_flags) && + (state == GFS2_BLKST_FREE)) + goto next_bitmap; + + bh = bi->bi_bh; + buffer = bh->b_data + bi->bi_offset; + WARN_ON(!buffer_uptodate(bh)); + if (state != GFS2_BLKST_UNLINKED && bi->bi_clone) + buffer = bi->bi_clone + bi->bi_offset; + initial_offset = rbm->offset; + offset = gfs2_bitfit(buffer, bi->bi_len, rbm->offset, state); + if (offset == BFITNOENT) + goto bitmap_full; + rbm->offset = offset; + if (ip == NULL) + return 0; + + initial_bii = rbm->bii; + ret = gfs2_reservation_check_and_update(rbm, ip, + minext ? *minext : 0, + &maxext); + if (ret == 0) + return 0; + if (ret > 0) { + n += (rbm->bii - initial_bii); + goto next_iter; + } + if (ret == -E2BIG) { + rbm->bii = 0; + rbm->offset = 0; + n += (rbm->bii - initial_bii); + goto res_covered_end_of_rgrp; + } + return ret; - spin_lock(&sdp->sd_rindex_spin); +bitmap_full: /* Mark bitmap as full and fall through */ + if ((state == GFS2_BLKST_FREE) && initial_offset == 0) { + struct gfs2_bitmap *bi = rbm_bi(rbm); + set_bit(GBF_FULL, &bi->bi_flags); + } - list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { - if (rgd == new_rgd) - goto out; +next_bitmap: /* Find next bitmap in the rgrp */ + rbm->offset = 0; + rbm->bii++; + if (rbm->bii == rbm->rgd->rd_length) + rbm->bii = 0; +res_covered_end_of_rgrp: + if ((rbm->bii == 0) && nowrap) + break; + n++; +next_iter: + if (n >= iters) + break; + } - if (++count >= max) - goto out; + if (minext == NULL || state != GFS2_BLKST_FREE) + return -ENOSPC; + + /* If the extent was too small, and it's smaller than the smallest + to have failed before, remember for future reference that it's + useless to search this rgrp again for this amount or more. */ + if ((first_offset == 0) && (first_bii == 0) && + (*minext < rbm->rgd->rd_extfail_pt)) + rbm->rgd->rd_extfail_pt = *minext; + + /* If the maximum extent we found is big enough to fulfill the + minimum requirements, use it anyway. */ + if (maxext.len) { + *rbm = maxext.rbm; + *minext = maxext.len; + return 0; } - list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list); -out: - spin_unlock(&sdp->sd_rindex_spin); + return -ENOSPC; } /** - * forward_rgrp_get - get an rgrp to try next from full list - * @sdp: The GFS2 superblock + * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes + * @rgd: The rgrp + * @last_unlinked: block address of the last dinode we unlinked + * @skip: block address we should explicitly not unlink * - * Returns: The rgrp to try next + * Returns: 0 if no error + * The inode, if one has been found, in inode. */ -static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp) +static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip) { - struct gfs2_rgrpd *rgd; - unsigned int journals = gfs2_jindex_size(sdp); - unsigned int rg = 0, x; + u64 block; + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_glock *gl; + struct gfs2_inode *ip; + int error; + int found = 0; + struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 }; - spin_lock(&sdp->sd_rindex_spin); + while (1) { + down_write(&sdp->sd_log_flush_lock); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL, + true, NULL); + up_write(&sdp->sd_log_flush_lock); + if (error == -ENOSPC) + break; + if (WARN_ON_ONCE(error)) + break; - rgd = sdp->sd_rindex_forward; - if (!rgd) { - if (sdp->sd_rgrps >= journals) - rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals; + block = gfs2_rbm_to_block(&rbm); + if (gfs2_rbm_from_block(&rbm, block + 1)) + break; + if (*last_unlinked != NO_BLOCK && block <= *last_unlinked) + continue; + if (block == skip) + continue; + *last_unlinked = block; - for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg; - x++, rgd = gfs2_rgrpd_get_next(rgd)) - /* Do Nothing */; + error = gfs2_glock_get(sdp, block, &gfs2_inode_glops, CREATE, &gl); + if (error) + continue; - sdp->sd_rindex_forward = rgd; - } + /* If the inode is already in cache, we can ignore it here + * because the existing inode disposal code will deal with + * it when all refs have gone away. Accessing gl_object like + * this is not safe in general. Here it is ok because we do + * not dereference the pointer, and we only need an approx + * answer to whether it is NULL or not. + */ + ip = gl->gl_object; - spin_unlock(&sdp->sd_rindex_spin); + if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_put(gl); + else + found++; - return rgd; + /* Limit reclaim to sensible number of tasks */ + if (found > NR_CPUS) + return; + } + + rgd->rd_flags &= ~GFS2_RDF_CHECK; + return; } /** - * forward_rgrp_set - set the forward rgrp pointer - * @sdp: the filesystem - * @rgd: The new forward rgrp + * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested + * @rgd: The rgrp in question + * @loops: An indication of how picky we can be (0=very, 1=less so) + * + * This function uses the recently added glock statistics in order to + * figure out whether a parciular resource group is suffering from + * contention from multiple nodes. This is done purely on the basis + * of timings, since this is the only data we have to work with and + * our aim here is to reject a resource group which is highly contended + * but (very important) not to do this too often in order to ensure that + * we do not land up introducing fragmentation by changing resource + * groups when not actually required. + * + * The calculation is fairly simple, we want to know whether the SRTTB + * (i.e. smoothed round trip time for blocking operations) to acquire + * the lock for this rgrp's glock is significantly greater than the + * time taken for resource groups on average. We introduce a margin in + * the form of the variable @var which is computed as the sum of the two + * respective variences, and multiplied by a factor depending on @loops + * and whether we have a lot of data to base the decision on. This is + * then tested against the square difference of the means in order to + * decide whether the result is statistically significant or not. * + * Returns: A boolean verdict on the congestion status */ -static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) +static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops) { - spin_lock(&sdp->sd_rindex_spin); - sdp->sd_rindex_forward = rgd; - spin_unlock(&sdp->sd_rindex_spin); + const struct gfs2_glock *gl = rgd->rd_gl; + const struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_lkstats *st; + s64 r_dcount, l_dcount; + s64 r_srttb, l_srttb; + s64 srttb_diff; + s64 sqr_diff; + s64 var; + + preempt_disable(); + st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP]; + r_srttb = st->stats[GFS2_LKS_SRTTB]; + r_dcount = st->stats[GFS2_LKS_DCOUNT]; + var = st->stats[GFS2_LKS_SRTTVARB] + + gl->gl_stats.stats[GFS2_LKS_SRTTVARB]; + preempt_enable(); + + l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB]; + l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT]; + + if ((l_dcount < 1) || (r_dcount < 1) || (r_srttb == 0)) + return false; + + srttb_diff = r_srttb - l_srttb; + sqr_diff = srttb_diff * srttb_diff; + + var *= 2; + if (l_dcount < 8 || r_dcount < 8) + var *= 2; + if (loops == 1) + var *= 2; + + return ((srttb_diff < 0) && (sqr_diff > var)); } /** - * get_local_rgrp - Choose and lock a rgrp for allocation - * @ip: the inode to reserve space for - * @rgp: the chosen and locked rgrp + * gfs2_rgrp_used_recently + * @rs: The block reservation with the rgrp to test + * @msecs: The time limit in milliseconds * - * Try to acquire rgrp in way which avoids contending with others. - * - * Returns: errno + * Returns: True if the rgrp glock has been used within the time limit */ - -static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) +static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs, + u64 msecs) { - struct inode *inode = NULL; - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_rgrpd *rgd, *begin = NULL; - struct gfs2_alloc *al = ip->i_alloc; - int flags = LM_FLAG_TRY; - int skipped = 0; - int loops = 0; - int error, rg_locked; - - /* Try recently successful rgrps */ - - rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc); - - while (rgd) { - rg_locked = 0; - - if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { - rg_locked = 1; - error = 0; - } else { - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, - LM_FLAG_TRY, &al->al_rgd_gh); - } - switch (error) { - case 0: - if (try_rgrp_fit(rgd, al)) - goto out; - if (rgd->rd_flags & GFS2_RDF_CHECK) - inode = try_rgrp_unlink(rgd, last_unlinked); - if (!rg_locked) - gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (inode) - return inode; - rgd = recent_rgrp_next(rgd, 1); - break; - - case GLR_TRYFAILED: - rgd = recent_rgrp_next(rgd, 0); - break; - - default: - return ERR_PTR(error); - } - } - - /* Go through full list of rgrps */ - - begin = rgd = forward_rgrp_get(sdp); - - for (;;) { - rg_locked = 0; - - if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { - rg_locked = 1; - error = 0; - } else { - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags, - &al->al_rgd_gh); - } - switch (error) { - case 0: - if (try_rgrp_fit(rgd, al)) - goto out; - if (rgd->rd_flags & GFS2_RDF_CHECK) - inode = try_rgrp_unlink(rgd, last_unlinked); - if (!rg_locked) - gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (inode) - return inode; - break; + u64 tdiff; - case GLR_TRYFAILED: - skipped++; - break; + tdiff = ktime_to_ns(ktime_sub(ktime_get_real(), + rs->rs_rbm.rgd->rd_gl->gl_dstamp)); - default: - return ERR_PTR(error); - } + return tdiff > (msecs * 1000 * 1000); +} - rgd = gfs2_rgrpd_get_next(rgd); - if (!rgd) - rgd = gfs2_rgrpd_get_first(sdp); - - if (rgd == begin) { - if (++loops >= 3) - return ERR_PTR(-ENOSPC); - if (!skipped) - loops++; - flags = 0; - if (loops == 2) - gfs2_log_flush(sdp, NULL); - } - } +static u32 gfs2_orlov_skip(const struct gfs2_inode *ip) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u32 skip; -out: - ip->i_last_rg_alloc = rgd->rd_addr; + get_random_bytes(&skip, sizeof(skip)); + return skip % sdp->sd_rgrps; +} - if (begin) { - recent_rgrp_add(rgd); - rgd = gfs2_rgrpd_get_next(rgd); - if (!rgd) - rgd = gfs2_rgrpd_get_first(sdp); - forward_rgrp_set(sdp, rgd); - } +static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin) +{ + struct gfs2_rgrpd *rgd = *pos; + struct gfs2_sbd *sdp = rgd->rd_sbd; - return NULL; + rgd = gfs2_rgrpd_get_next(rgd); + if (rgd == NULL) + rgd = gfs2_rgrpd_get_first(sdp); + *pos = rgd; + if (rgd != begin) /* If we didn't wrap */ + return true; + return false; } /** - * gfs2_inplace_reserve_i - Reserve space in the filesystem + * gfs2_inplace_reserve - Reserve space in the filesystem * @ip: the inode to reserve space for + * @ap: the allocation parameters * * Returns: errno */ -int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) +int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; - struct inode *inode; - int error = 0; + struct gfs2_rgrpd *begin = NULL; + struct gfs2_blkreserv *rs = ip->i_res; + int error = 0, rg_locked, flags = 0; u64 last_unlinked = NO_BLOCK; + int loops = 0; + u32 skip = 0; - if (gfs2_assert_warn(sdp, al->al_requested)) + if (sdp->sd_args.ar_rgrplvb) + flags |= GL_SKIP; + if (gfs2_assert_warn(sdp, ap->target)) return -EINVAL; + if (gfs2_rs_active(rs)) { + begin = rs->rs_rbm.rgd; + } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) { + rs->rs_rbm.rgd = begin = ip->i_rgd; + } else { + rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); + } + if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV)) + skip = gfs2_orlov_skip(ip); + if (rs->rs_rbm.rgd == NULL) + return -EBADSLT; + + while (loops < 3) { + rg_locked = 1; + + if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) { + rg_locked = 0; + if (skip && skip--) + goto next_rgrp; + if (!gfs2_rs_active(rs) && (loops < 2) && + gfs2_rgrp_used_recently(rs, 1000) && + gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) + goto next_rgrp; + error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl, + LM_ST_EXCLUSIVE, flags, + &rs->rs_rgd_gh); + if (unlikely(error)) + return error; + if (!gfs2_rs_active(rs) && (loops < 2) && + gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) + goto skip_rgrp; + if (sdp->sd_args.ar_rgrplvb) { + error = update_rgrp_lvb(rs->rs_rbm.rgd); + if (unlikely(error)) { + gfs2_glock_dq_uninit(&rs->rs_rgd_gh); + return error; + } + } + } -try_again: - /* We need to hold the rindex unless the inode we're using is - the rindex itself, in which case it's already held. */ - if (ip != GFS2_I(sdp->sd_rindex)) - error = gfs2_rindex_hold(sdp, &al->al_ri_gh); - else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ - error = gfs2_ri_update_special(ip); + /* Skip unuseable resource groups */ + if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | + GFS2_RDF_ERROR)) || + (ap->target > rs->rs_rbm.rgd->rd_extfail_pt)) + goto skip_rgrp; - if (error) - return error; + if (sdp->sd_args.ar_rgrplvb) + gfs2_rgrp_bh_get(rs->rs_rbm.rgd); - inode = get_local_rgrp(ip, &last_unlinked); - if (inode) { - if (ip != GFS2_I(sdp->sd_rindex)) - gfs2_glock_dq_uninit(&al->al_ri_gh); - if (IS_ERR(inode)) - return PTR_ERR(inode); - iput(inode); - gfs2_log_flush(sdp, NULL); - goto try_again; - } + /* Get a reservation if we don't already have one */ + if (!gfs2_rs_active(rs)) + rg_mblk_search(rs->rs_rbm.rgd, ip, ap); - al->al_file = file; - al->al_line = line; + /* Skip rgrps when we can't get a reservation on first pass */ + if (!gfs2_rs_active(rs) && (loops < 1)) + goto check_rgrp; - return 0; + /* If rgrp has enough free space, use it */ + if (rs->rs_rbm.rgd->rd_free_clone >= ap->target) { + ip->i_rgd = rs->rs_rbm.rgd; + return 0; + } + +check_rgrp: + /* Check for unlinked inodes which can be reclaimed */ + if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) + try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked, + ip->i_no_addr); +skip_rgrp: + /* Drop reservation, if we couldn't use reserved rgrp */ + if (gfs2_rs_active(rs)) + gfs2_rs_deltree(rs); + + /* Unlock rgrp if required */ + if (!rg_locked) + gfs2_glock_dq_uninit(&rs->rs_rgd_gh); +next_rgrp: + /* Find the next rgrp, and continue looking */ + if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin)) + continue; + if (skip) + continue; + + /* If we've scanned all the rgrps, but found no free blocks + * then this checks for some less likely conditions before + * trying again. + */ + loops++; + /* Check that fs hasn't grown if writing to rindex */ + if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) { + error = gfs2_ri_update(ip); + if (error) + return error; + } + /* Flushing the log may release space */ + if (loops == 2) + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + } + + return -ENOSPC; } /** @@ -1219,20 +2016,10 @@ try_again: void gfs2_inplace_release(struct gfs2_inode *ip) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; - - if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1) - fs_warn(sdp, "al_alloced = %u, al_requested = %u " - "al_file = %s, al_line = %u\n", - al->al_alloced, al->al_requested, al->al_file, - al->al_line); + struct gfs2_blkreserv *rs = ip->i_res; - al->al_rgd = NULL; - if (al->al_rgd_gh.gh_gl) - gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (ip != GFS2_I(sdp->sd_rindex)) - gfs2_glock_dq_uninit(&al->al_ri_gh); + if (rs->rs_rgd_gh.gh_gl) + gfs2_glock_dq_uninit(&rs->rs_rgd_gh); } /** @@ -1243,105 +2030,49 @@ void gfs2_inplace_release(struct gfs2_inode *ip) * Returns: The block type (GFS2_BLKST_*) */ -unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) +static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) { - struct gfs2_bitmap *bi = NULL; - u32 length, rgrp_block, buf_block; - unsigned int buf; - unsigned char type; - - length = rgd->rd_length; - rgrp_block = block - rgd->rd_data0; - - for (buf = 0; buf < length; buf++) { - bi = rgd->rd_bits + buf; - if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY) - break; - } - - gfs2_assert(rgd->rd_sbd, buf < length); - buf_block = rgrp_block - bi->bi_start * GFS2_NBBY; + struct gfs2_rbm rbm = { .rgd = rgd, }; + int ret; - type = gfs2_testbit(rgd, bi->bi_bh->b_data + bi->bi_offset, - bi->bi_len, buf_block); + ret = gfs2_rbm_from_block(&rbm, block); + WARN_ON_ONCE(ret != 0); - return type; + return gfs2_testbit(&rbm); } + /** - * rgblk_search - find a block in @old_state, change allocation - * state to @new_state - * @rgd: the resource group descriptor - * @goal: the goal block within the RG (start here to search for avail block) - * @old_state: GFS2_BLKST_XXX the before-allocation state to find - * @new_state: GFS2_BLKST_XXX the after-allocation block state + * gfs2_alloc_extent - allocate an extent from a given bitmap + * @rbm: the resource group information + * @dinode: TRUE if the first block we allocate is for a dinode + * @n: The extent length (value/result) * - * Walk rgrp's bitmap to find bits that represent a block in @old_state. - * Add the found bitmap buffer to the transaction. + * Add the bitmap buffer to the transaction. * Set the found bits to @new_state to change block's allocation state. - * - * This function never fails, because we wouldn't call it unless we - * know (from reservation results, etc.) that a block is available. - * - * Scope of @goal and returned block is just within rgrp, not the whole - * filesystem. - * - * Returns: the block number allocated */ - -static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, - unsigned char old_state, unsigned char new_state) +static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode, + unsigned int *n) { - struct gfs2_bitmap *bi = NULL; - u32 length = rgd->rd_length; - u32 blk = 0; - unsigned int buf, x; - - /* Find bitmap block that contains bits for goal block */ - for (buf = 0; buf < length; buf++) { - bi = rgd->rd_bits + buf; - if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) - break; - } - - gfs2_assert(rgd->rd_sbd, buf < length); - - /* Convert scope of "goal" from rgrp-wide to within found bit block */ - goal -= bi->bi_start * GFS2_NBBY; - - /* Search (up to entire) bitmap in this rgrp for allocatable block. - "x <= length", instead of "x < length", because we typically start - the search in the middle of a bit block, but if we can't find an - allocatable block anywhere else, we want to be able wrap around and - search in the first part of our first-searched bit block. */ - for (x = 0; x <= length; x++) { - /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone - bitmaps, so we must search the originals for that. */ - if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) - blk = gfs2_bitfit(bi->bi_clone + bi->bi_offset, - bi->bi_len, goal, old_state); - else - blk = gfs2_bitfit(bi->bi_bh->b_data + bi->bi_offset, - bi->bi_len, goal, old_state); - if (blk != BFITNOENT) + struct gfs2_rbm pos = { .rgd = rbm->rgd, }; + const unsigned int elen = *n; + u64 block; + int ret; + + *n = 1; + block = gfs2_rbm_to_block(rbm); + gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh); + gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); + block++; + while (*n < elen) { + ret = gfs2_rbm_from_block(&pos, block); + if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE) break; - - /* Try next bitmap block (wrap back to rgrp header if at end) */ - buf = (buf + 1) % length; - bi = rgd->rd_bits + buf; - goal = 0; + gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh); + gfs2_setbit(&pos, true, GFS2_BLKST_USED); + (*n)++; + block++; } - - if (blk != BFITNOENT && old_state != new_state) { - gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); - gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset, - bi->bi_len, blk, new_state); - if (bi->bi_clone) - gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset, - bi->bi_len, blk, new_state); - } - - return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk; } /** @@ -1357,191 +2088,241 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, u32 blen, unsigned char new_state) { - struct gfs2_rgrpd *rgd; - struct gfs2_bitmap *bi = NULL; - u32 length, rgrp_blk, buf_blk; - unsigned int buf; + struct gfs2_rbm rbm; + struct gfs2_bitmap *bi; - rgd = gfs2_blk2rgrpd(sdp, bstart); - if (!rgd) { + rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1); + if (!rbm.rgd) { if (gfs2_consist(sdp)) fs_err(sdp, "block = %llu\n", (unsigned long long)bstart); return NULL; } - length = rgd->rd_length; - - rgrp_blk = bstart - rgd->rd_data0; - while (blen--) { - for (buf = 0; buf < length; buf++) { - bi = rgd->rd_bits + buf; - if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY) - break; - } - - gfs2_assert(rgd->rd_sbd, buf < length); - - buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY; - rgrp_blk++; - + gfs2_rbm_from_block(&rbm, bstart); + bi = rbm_bi(&rbm); + bstart++; if (!bi->bi_clone) { bi->bi_clone = kmalloc(bi->bi_bh->b_size, GFP_NOFS | __GFP_NOFAIL); memcpy(bi->bi_clone + bi->bi_offset, - bi->bi_bh->b_data + bi->bi_offset, - bi->bi_len); + bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); } - gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); - gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset, - bi->bi_len, buf_blk, new_state); + gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh); + gfs2_setbit(&rbm, false, new_state); } - return rgd; + return rbm.rgd; } /** - * gfs2_alloc_data - Allocate a data block - * @ip: the inode to allocate the data block for + * gfs2_rgrp_dump - print out an rgrp + * @seq: The iterator + * @gl: The glock in question * - * Returns: the allocated block */ -u64 gfs2_alloc_data(struct gfs2_inode *ip) +void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; - struct gfs2_rgrpd *rgd = al->al_rgd; - u32 goal, blk; - u64 block; - - if (rgrp_contains_block(rgd, ip->i_di.di_goal_data)) - goal = ip->i_di.di_goal_data - rgd->rd_data0; - else - goal = rgd->rd_last_alloc_data; - - blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED); - BUG_ON(blk == BFITNOENT); - rgd->rd_last_alloc_data = blk; - - block = rgd->rd_data0 + blk; - ip->i_di.di_goal_data = block; - - gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); - rgd->rd_rg.rg_free--; - - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + struct gfs2_rgrpd *rgd = gl->gl_object; + struct gfs2_blkreserv *trs; + const struct rb_node *n; - al->al_alloced++; - - gfs2_statfs_change(sdp, 0, -1, 0); - gfs2_quota_change(ip, +1, ip->i_inode.i_uid, ip->i_inode.i_gid); - - spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone--; - spin_unlock(&sdp->sd_rindex_spin); + if (rgd == NULL) + return; + gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n", + (unsigned long long)rgd->rd_addr, rgd->rd_flags, + rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, + rgd->rd_reserved, rgd->rd_extfail_pt); + spin_lock(&rgd->rd_rsspin); + for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { + trs = rb_entry(n, struct gfs2_blkreserv, rs_node); + dump_rs(seq, trs); + } + spin_unlock(&rgd->rd_rsspin); +} - return block; +static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", + (unsigned long long)rgd->rd_addr); + fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); + gfs2_rgrp_dump(NULL, rgd->rd_gl); + rgd->rd_flags |= GFS2_RDF_ERROR; } /** - * gfs2_alloc_meta - Allocate a metadata block - * @ip: the inode to allocate the metadata block for + * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation + * @ip: The inode we have just allocated blocks for + * @rbm: The start of the allocated blocks + * @len: The extent length * - * Returns: the allocated block + * Adjusts a reservation after an allocation has taken place. If the + * reservation does not match the allocation, or if it is now empty + * then it is removed. */ -u64 gfs2_alloc_meta(struct gfs2_inode *ip) +static void gfs2_adjust_reservation(struct gfs2_inode *ip, + const struct gfs2_rbm *rbm, unsigned len) { - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; - struct gfs2_rgrpd *rgd = al->al_rgd; - u32 goal, blk; + struct gfs2_blkreserv *rs = ip->i_res; + struct gfs2_rgrpd *rgd = rbm->rgd; + unsigned rlen; u64 block; + int ret; + + spin_lock(&rgd->rd_rsspin); + if (gfs2_rs_active(rs)) { + if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) { + block = gfs2_rbm_to_block(rbm); + ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len); + rlen = min(rs->rs_free, len); + rs->rs_free -= rlen; + rgd->rd_reserved -= rlen; + trace_gfs2_rs(rs, TRACE_RS_CLAIM); + if (rs->rs_free && !ret) + goto out; + } + __rs_deltree(rs); + } +out: + spin_unlock(&rgd->rd_rsspin); +} - if (rgrp_contains_block(rgd, ip->i_di.di_goal_meta)) - goal = ip->i_di.di_goal_meta - rgd->rd_data0; - else - goal = rgd->rd_last_alloc_meta; - - blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED); - BUG_ON(blk == BFITNOENT); - rgd->rd_last_alloc_meta = blk; - - block = rgd->rd_data0 + blk; - ip->i_di.di_goal_meta = block; - - gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); - rgd->rd_rg.rg_free--; - - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); +/** + * gfs2_set_alloc_start - Set starting point for block allocation + * @rbm: The rbm which will be set to the required location + * @ip: The gfs2 inode + * @dinode: Flag to say if allocation includes a new inode + * + * This sets the starting point from the reservation if one is active + * otherwise it falls back to guessing a start point based on the + * inode's goal block or the last allocation point in the rgrp. + */ - al->al_alloced++; +static void gfs2_set_alloc_start(struct gfs2_rbm *rbm, + const struct gfs2_inode *ip, bool dinode) +{ + u64 goal; - gfs2_statfs_change(sdp, 0, -1, 0); - gfs2_quota_change(ip, +1, ip->i_inode.i_uid, ip->i_inode.i_gid); - gfs2_trans_add_unrevoke(sdp, block); + if (gfs2_rs_active(ip->i_res)) { + *rbm = ip->i_res->rs_rbm; + return; + } - spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone--; - spin_unlock(&sdp->sd_rindex_spin); + if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal)) + goal = ip->i_goal; + else + goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0; - return block; + gfs2_rbm_from_block(rbm, goal); } /** - * gfs2_alloc_di - Allocate a dinode - * @dip: the directory that the inode is going in + * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode + * @ip: the inode to allocate the block for + * @bn: Used to return the starting block number + * @nblocks: requested number of blocks/extent length (value/result) + * @dinode: 1 if we're allocating a dinode block, else 0 + * @generation: the generation number of the inode * - * Returns: the block allocated + * Returns: 0 or error */ -u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) +int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, + bool dinode, u64 *generation) { - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct gfs2_alloc *al = dip->i_alloc; - struct gfs2_rgrpd *rgd = al->al_rgd; - u32 blk; - u64 block; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + struct gfs2_rbm rbm = { .rgd = ip->i_rgd, }; + unsigned int ndata; + u64 block; /* block, within the file system scope */ + int error; - blk = rgblk_search(rgd, rgd->rd_last_alloc_meta, - GFS2_BLKST_FREE, GFS2_BLKST_DINODE); - BUG_ON(blk == BFITNOENT); + gfs2_set_alloc_start(&rbm, ip, dinode); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL); - rgd->rd_last_alloc_meta = blk; + if (error == -ENOSPC) { + gfs2_set_alloc_start(&rbm, ip, dinode); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false, + NULL); + } - block = rgd->rd_data0 + blk; + /* Since all blocks are reserved in advance, this shouldn't happen */ + if (error) { + fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n", + (unsigned long long)ip->i_no_addr, error, *nblocks, + test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags), + rbm.rgd->rd_extfail_pt); + goto rgrp_error; + } - gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); - rgd->rd_rg.rg_free--; - rgd->rd_rg.rg_dinodes++; - *generation = rgd->rd_rg.rg_igeneration++; - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + gfs2_alloc_extent(&rbm, dinode, nblocks); + block = gfs2_rbm_to_block(&rbm); + rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0; + if (gfs2_rs_active(ip->i_res)) + gfs2_adjust_reservation(ip, &rbm, *nblocks); + ndata = *nblocks; + if (dinode) + ndata--; + + if (!dinode) { + ip->i_goal = block + ndata - 1; + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error == 0) { + struct gfs2_dinode *di = + (struct gfs2_dinode *)dibh->b_data; + gfs2_trans_add_meta(ip->i_gl, dibh); + di->di_goal_meta = di->di_goal_data = + cpu_to_be64(ip->i_goal); + brelse(dibh); + } + } + if (rbm.rgd->rd_free < *nblocks) { + pr_warn("nblocks=%u\n", *nblocks); + goto rgrp_error; + } - al->al_alloced++; + rbm.rgd->rd_free -= *nblocks; + if (dinode) { + rbm.rgd->rd_dinodes++; + *generation = rbm.rgd->rd_igeneration++; + if (*generation == 0) + *generation = rbm.rgd->rd_igeneration++; + } - gfs2_statfs_change(sdp, 0, -1, +1); - gfs2_trans_add_unrevoke(sdp, block); + gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh); + gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data); - spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone--; - spin_unlock(&sdp->sd_rindex_spin); + gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); + if (dinode) + gfs2_trans_add_unrevoke(sdp, block, *nblocks); - return block; + gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid); + + rbm.rgd->rd_free_clone -= *nblocks; + trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks, + dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); + *bn = block; + return 0; + +rgrp_error: + gfs2_rgrp_error(rbm.rgd); + return -EIO; } /** - * gfs2_free_data - free a contiguous run of data block(s) + * __gfs2_free_blocks - free a contiguous run of block(s) * @ip: the inode these blocks are being freed from * @bstart: first block of a run of contiguous blocks * @blen: the length of the block run + * @meta: 1 if the blocks represent metadata * */ -void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) +void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; @@ -1549,16 +2330,16 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); if (!rgd) return; - - rgd->rd_rg.rg_free += blen; - - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); - - gfs2_trans_add_rg(rgd); - - gfs2_statfs_change(sdp, 0, +blen, 0); - gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); + trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE); + rgd->rd_free += blen; + rgd->rd_flags &= ~GFS2_RGF_TRIMMED; + gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); + + /* Directories keep their data in the metadata address space */ + if (meta || ip->i_depth) + gfs2_meta_wipe(ip, bstart, blen); } /** @@ -1572,22 +2353,10 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_rgrpd *rgd; - - rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); - if (!rgd) - return; - - rgd->rd_rg.rg_free += blen; - - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); - - gfs2_trans_add_rg(rgd); + __gfs2_free_blocks(ip, bstart, blen, 1); gfs2_statfs_change(sdp, 0, +blen, 0); gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); - gfs2_meta_wipe(ip, bstart, blen); } void gfs2_unlink_di(struct inode *inode) @@ -1600,9 +2369,11 @@ void gfs2_unlink_di(struct inode *inode) rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED); if (!rgd) return; - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); - gfs2_trans_add_rg(rgd); + trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); + gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); + update_rgrp_lvb_unlinked(rgd, 1); } static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) @@ -1615,29 +2386,64 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) return; gfs2_assert_withdraw(sdp, rgd == tmp_rgd); - if (!rgd->rd_rg.rg_dinodes) + if (!rgd->rd_dinodes) gfs2_consist_rgrpd(rgd); - rgd->rd_rg.rg_dinodes--; - rgd->rd_rg.rg_free++; + rgd->rd_dinodes--; + rgd->rd_free++; - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); + gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); + update_rgrp_lvb_unlinked(rgd, -1); gfs2_statfs_change(sdp, 0, +1, -1); - gfs2_trans_add_rg(rgd); } void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) { gfs2_free_uninit_di(rgd, ip->i_no_addr); + trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE); gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid); gfs2_meta_wipe(ip, ip->i_no_addr, 1); } /** + * gfs2_check_blk_type - Check the type of a block + * @sdp: The superblock + * @no_addr: The block number to check + * @type: The block type we are looking for + * + * Returns: 0 if the block type matches the expected type + * -ESTALE if it doesn't match + * or -ve errno if something went wrong while checking + */ + +int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_holder rgd_gh; + int error = -EINVAL; + + rgd = gfs2_blk2rgrpd(sdp, no_addr, 1); + if (!rgd) + goto fail; + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); + if (error) + goto fail; + + if (gfs2_get_block_type(rgd, no_addr) != type) + error = -ESTALE; + + gfs2_glock_dq_uninit(&rgd_gh); +fail: + return error; +} + +/** * gfs2_rlist_add - add a RG to a list of RGs - * @sdp: the filesystem + * @ip: the inode * @rlist: the list of resource groups * @block: the block * @@ -1647,9 +2453,10 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) * */ -void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, +void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, u64 block) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; struct gfs2_rgrpd **tmp; unsigned int new_space; @@ -1658,12 +2465,15 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, if (gfs2_assert_warn(sdp, !rlist->rl_ghs)) return; - rgd = gfs2_blk2rgrpd(sdp, block); + if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block)) + rgd = ip->i_rgd; + else + rgd = gfs2_blk2rgrpd(sdp, block, 1); if (!rgd) { - if (gfs2_consist(sdp)) - fs_err(sdp, "block = %llu\n", (unsigned long long)block); + fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block); return; } + ip->i_rgd = rgd; for (x = 0; x < rlist->rl_rgrps; x++) if (rlist->rl_rgd[x] == rgd) @@ -1693,14 +2503,12 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, * and initialize an array of glock holders for them * @rlist: the list of resource groups * @state: the lock state to acquire the RG lock in - * @flags: the modifier flags for the holder structures * * FIXME: Don't use NOFAIL * */ -void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state, - int flags) +void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state) { unsigned int x; @@ -1708,13 +2516,13 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state, GFP_NOFS | __GFP_NOFAIL); for (x = 0; x < rlist->rl_rgrps; x++) gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, - state, flags, + state, 0, &rlist->rl_ghs[x]); } /** * gfs2_rlist_free - free a resource group list - * @list: the list of resource groups + * @rlist: the list of resource groups * */ @@ -1728,6 +2536,7 @@ void gfs2_rlist_free(struct gfs2_rgrp_list *rlist) for (x = 0; x < rlist->rl_rgrps; x++) gfs2_holder_uninit(&rlist->rl_ghs[x]); kfree(rlist->rl_ghs); + rlist->rl_ghs = NULL; } } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 149bb161f4b..463ab2e95d1 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -10,50 +10,51 @@ #ifndef __RGRP_DOT_H__ #define __RGRP_DOT_H__ +#include <linux/slab.h> +#include <linux/uaccess.h> + +/* Since each block in the file system is represented by two bits in the + * bitmap, one 64-bit word in the bitmap will represent 32 blocks. + * By reserving 32 blocks at a time, we can optimize / shortcut how we search + * through the bitmaps by looking a word at a time. + */ +#define RGRP_RSRV_MINBYTES 8 +#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY)) + struct gfs2_rgrpd; struct gfs2_sbd; struct gfs2_holder; -void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); +extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); -struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); -struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); -struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); +extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact); +extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); +extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); -void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); -int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); +extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); +extern int gfs2_rindex_update(struct gfs2_sbd *sdp); +extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); +extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh); +extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); -int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); -void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); -void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); +extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); -void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); - -struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); -static inline void gfs2_alloc_put(struct gfs2_inode *ip) -{ - BUG_ON(ip->i_alloc == NULL); - kfree(ip->i_alloc); - ip->i_alloc = NULL; -} +#define GFS2_AF_ORLOV 1 +extern int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap); +extern void gfs2_inplace_release(struct gfs2_inode *ip); -int gfs2_inplace_reserve_i(struct gfs2_inode *ip, - char *file, unsigned int line); -#define gfs2_inplace_reserve(ip) \ -gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) +extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, + bool dinode, u64 *generation); -void gfs2_inplace_release(struct gfs2_inode *ip); - -unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); - -u64 gfs2_alloc_data(struct gfs2_inode *ip); -u64 gfs2_alloc_meta(struct gfs2_inode *ip); -u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); - -void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); -void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); -void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); -void gfs2_unlink_di(struct inode *inode); +extern int gfs2_rs_alloc(struct gfs2_inode *ip); +extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); +extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount); +extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); +extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); +extern void gfs2_unlink_di(struct inode *inode); +extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, + unsigned int type); struct gfs2_rgrp_list { unsigned int rl_rgrps; @@ -62,11 +63,21 @@ struct gfs2_rgrp_list { struct gfs2_holder *rl_ghs; }; -void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, - u64 block); -void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state, - int flags); -void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); -u64 gfs2_ri_total(struct gfs2_sbd *sdp); +extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, + u64 block); +extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); +extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); +extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); +extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); +extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + struct buffer_head *bh, + const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); +extern int gfs2_fitrim(struct file *filp, void __user *argp); + +/* This is how to tell if a reservation is in the rgrp tree: */ +static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs) +{ + return rs && !RB_EMPTY_NODE(&rs->rs_node); +} #endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index ef0562c3bc7..1319b5c4ec6 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -7,15 +7,25 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/bio.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> -#include <linux/crc32.h> +#include <linux/statfs.h> +#include <linux/seq_file.h> +#include <linux/mount.h> +#include <linux/kthread.h> +#include <linux/delay.h> #include <linux/gfs2_ondisk.h> -#include <linux/bio.h> -#include <linux/lm_interface.h> +#include <linux/crc32.h> +#include <linux/time.h> +#include <linux/wait.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> #include "gfs2.h" #include "incore.h" @@ -32,381 +42,249 @@ #include "super.h" #include "trans.h" #include "util.h" - -static const u32 gfs2_old_fs_formats[] = { - 0 +#include "sys.h" +#include "xattr.h" + +#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) + +enum { + Opt_lockproto, + Opt_locktable, + Opt_hostdata, + Opt_spectator, + Opt_ignore_local_fs, + Opt_localflocks, + Opt_localcaching, + Opt_debug, + Opt_nodebug, + Opt_upgrade, + Opt_acl, + Opt_noacl, + Opt_quota_off, + Opt_quota_account, + Opt_quota_on, + Opt_quota, + Opt_noquota, + Opt_suiddir, + Opt_nosuiddir, + Opt_data_writeback, + Opt_data_ordered, + Opt_meta, + Opt_discard, + Opt_nodiscard, + Opt_commit, + Opt_err_withdraw, + Opt_err_panic, + Opt_statfs_quantum, + Opt_statfs_percent, + Opt_quota_quantum, + Opt_barrier, + Opt_nobarrier, + Opt_rgrplvb, + Opt_norgrplvb, + Opt_error, }; -static const u32 gfs2_old_multihost_formats[] = { - 0 +static const match_table_t tokens = { + {Opt_lockproto, "lockproto=%s"}, + {Opt_locktable, "locktable=%s"}, + {Opt_hostdata, "hostdata=%s"}, + {Opt_spectator, "spectator"}, + {Opt_spectator, "norecovery"}, + {Opt_ignore_local_fs, "ignore_local_fs"}, + {Opt_localflocks, "localflocks"}, + {Opt_localcaching, "localcaching"}, + {Opt_debug, "debug"}, + {Opt_nodebug, "nodebug"}, + {Opt_upgrade, "upgrade"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_quota_off, "quota=off"}, + {Opt_quota_account, "quota=account"}, + {Opt_quota_on, "quota=on"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, + {Opt_suiddir, "suiddir"}, + {Opt_nosuiddir, "nosuiddir"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_data_ordered, "data=ordered"}, + {Opt_meta, "meta"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_commit, "commit=%d"}, + {Opt_err_withdraw, "errors=withdraw"}, + {Opt_err_panic, "errors=panic"}, + {Opt_statfs_quantum, "statfs_quantum=%d"}, + {Opt_statfs_percent, "statfs_percent=%d"}, + {Opt_quota_quantum, "quota_quantum=%d"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_rgrplvb, "rgrplvb"}, + {Opt_norgrplvb, "norgrplvb"}, + {Opt_error, NULL} }; /** - * gfs2_tune_init - Fill a gfs2_tune structure with default values - * @gt: tune - * - */ - -void gfs2_tune_init(struct gfs2_tune *gt) -{ - spin_lock_init(>->gt_spin); - - gt->gt_demote_secs = 300; - gt->gt_incore_log_blocks = 1024; - gt->gt_log_flush_secs = 60; - gt->gt_recoverd_secs = 60; - gt->gt_logd_secs = 1; - gt->gt_quotad_secs = 5; - gt->gt_quota_simul_sync = 64; - gt->gt_quota_warn_period = 10; - gt->gt_quota_scale_num = 1; - gt->gt_quota_scale_den = 1; - gt->gt_quota_cache_secs = 300; - gt->gt_quota_quantum = 60; - gt->gt_atime_quantum = 3600; - gt->gt_new_files_jdata = 0; - gt->gt_new_files_directio = 0; - gt->gt_max_readahead = 1 << 18; - gt->gt_stall_secs = 600; - gt->gt_complain_secs = 10; - gt->gt_statfs_quantum = 30; - gt->gt_statfs_slow = 0; -} - -/** - * gfs2_check_sb - Check superblock - * @sdp: the filesystem - * @sb: The superblock - * @silent: Don't print a message if the check fails - * - * Checks the version code of the FS is one that we understand how to - * read and that the sizes of the various on-disk structures have not - * changed. - */ - -int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) -{ - unsigned int x; - - if (sb->sb_magic != GFS2_MAGIC || - sb->sb_type != GFS2_METATYPE_SB) { - if (!silent) - printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n"); - return -EINVAL; - } - - /* If format numbers match exactly, we're done. */ - - if (sb->sb_fs_format == GFS2_FORMAT_FS && - sb->sb_multihost_format == GFS2_FORMAT_MULTI) - return 0; - - if (sb->sb_fs_format != GFS2_FORMAT_FS) { - for (x = 0; gfs2_old_fs_formats[x]; x++) - if (gfs2_old_fs_formats[x] == sb->sb_fs_format) - break; - - if (!gfs2_old_fs_formats[x]) { - printk(KERN_WARNING - "GFS2: code version (%u, %u) is incompatible " - "with ondisk format (%u, %u)\n", - GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, - sb->sb_fs_format, sb->sb_multihost_format); - printk(KERN_WARNING - "GFS2: I don't know how to upgrade this FS\n"); - return -EINVAL; - } - } - - if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) { - for (x = 0; gfs2_old_multihost_formats[x]; x++) - if (gfs2_old_multihost_formats[x] == - sb->sb_multihost_format) - break; - - if (!gfs2_old_multihost_formats[x]) { - printk(KERN_WARNING - "GFS2: code version (%u, %u) is incompatible " - "with ondisk format (%u, %u)\n", - GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, - sb->sb_fs_format, sb->sb_multihost_format); - printk(KERN_WARNING - "GFS2: I don't know how to upgrade this FS\n"); - return -EINVAL; - } - } - - if (!sdp->sd_args.ar_upgrade) { - printk(KERN_WARNING - "GFS2: code version (%u, %u) is incompatible " - "with ondisk format (%u, %u)\n", - GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, - sb->sb_fs_format, sb->sb_multihost_format); - printk(KERN_INFO - "GFS2: Use the \"upgrade\" mount option to upgrade " - "the FS\n"); - printk(KERN_INFO "GFS2: See the manual for more details\n"); - return -EINVAL; - } - - return 0; -} - - -static void end_bio_io_page(struct bio *bio, int error) -{ - struct page *page = bio->bi_private; - - if (!error) - SetPageUptodate(page); - else - printk(KERN_WARNING "gfs2: error %d reading superblock\n", error); - unlock_page(page); -} - -static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf) -{ - const struct gfs2_sb *str = buf; - - sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic); - sb->sb_type = be32_to_cpu(str->sb_header.mh_type); - sb->sb_format = be32_to_cpu(str->sb_header.mh_format); - sb->sb_fs_format = be32_to_cpu(str->sb_fs_format); - sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format); - sb->sb_bsize = be32_to_cpu(str->sb_bsize); - sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift); - sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr); - sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino); - sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr); - sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino); - - memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); - memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); -} - -/** - * gfs2_read_super - Read the gfs2 super block from disk - * @sdp: The GFS2 super block - * @sector: The location of the super block - * @error: The error code to return + * gfs2_mount_args - Parse mount options + * @args: The structure into which the parsed options will be written + * @options: The options to parse * - * This uses the bio functions to read the super block from disk - * because we want to be 100% sure that we never read cached data. - * A super block is read twice only during each GFS2 mount and is - * never written to by the filesystem. The first time its read no - * locks are held, and the only details which are looked at are those - * relating to the locking protocol. Once locking is up and working, - * the sb is read again under the lock to establish the location of - * the master directory (contains pointers to journals etc) and the - * root directory. - * - * Returns: 0 on success or error + * Return: errno */ -int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) +int gfs2_mount_args(struct gfs2_args *args, char *options) { - struct super_block *sb = sdp->sd_vfs; - struct gfs2_sb *p; - struct page *page; - struct bio *bio; - - page = alloc_page(GFP_KERNEL); - if (unlikely(!page)) - return -ENOBUFS; - - ClearPageUptodate(page); - ClearPageDirty(page); - lock_page(page); - - bio = bio_alloc(GFP_KERNEL, 1); - if (unlikely(!bio)) { - __free_page(page); - return -ENOBUFS; - } - - bio->bi_sector = sector * (sb->s_blocksize >> 9); - bio->bi_bdev = sb->s_bdev; - bio_add_page(bio, page, PAGE_SIZE, 0); - - bio->bi_end_io = end_bio_io_page; - bio->bi_private = page; - submit_bio(READ_SYNC | (1 << BIO_RW_META), bio); - wait_on_page_locked(page); - bio_put(bio); - if (!PageUptodate(page)) { - __free_page(page); - return -EIO; - } - p = kmap(page); - gfs2_sb_in(&sdp->sd_sb, p); - kunmap(page); - __free_page(page); - return 0; -} - -/** - * gfs2_read_sb - Read super block - * @sdp: The GFS2 superblock - * @gl: the glock for the superblock (assumed to be held) - * @silent: Don't print message if mount fails - * - */ - -int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent) -{ - u32 hash_blocks, ind_blocks, leaf_blocks; - u32 tmp_blocks; - unsigned int x; - int error; - - error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); - if (error) { - if (!silent) - fs_err(sdp, "can't read superblock\n"); - return error; - } - - error = gfs2_check_sb(sdp, &sdp->sd_sb, silent); - if (error) - return error; - - sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - - GFS2_BASIC_BLOCK_SHIFT; - sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; - sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_dinode)) / sizeof(u64); - sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_meta_header)) / sizeof(u64); - sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); - sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2; - sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1; - sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64); - sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_meta_header)) / - sizeof(struct gfs2_quota_change); - - /* Compute maximum reservation required to add a entry to a directory */ - - hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH), - sdp->sd_jbsize); - - ind_blocks = 0; - for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) { - tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs); - ind_blocks += tmp_blocks; - } - - leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH; - - sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks; + char *o; + int token; + substring_t tmp[MAX_OPT_ARGS]; + int rv; - sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_dinode); - sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs; - for (x = 2;; x++) { - u64 space, d; - u32 m; + /* Split the options into tokens with the "," character and + process them */ - space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs; - d = space; - m = do_div(d, sdp->sd_inptrs); - - if (d != sdp->sd_heightsize[x - 1] || m) + while (1) { + o = strsep(&options, ","); + if (o == NULL) break; - sdp->sd_heightsize[x] = space; - } - sdp->sd_max_height = x; - gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT); - - sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_dinode); - sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs; - for (x = 2;; x++) { - u64 space, d; - u32 m; - - space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs; - d = space; - m = do_div(d, sdp->sd_inptrs); - - if (d != sdp->sd_jheightsize[x - 1] || m) + if (*o == '\0') + continue; + + token = match_token(o, tokens, tmp); + switch (token) { + case Opt_lockproto: + match_strlcpy(args->ar_lockproto, &tmp[0], + GFS2_LOCKNAME_LEN); break; - sdp->sd_jheightsize[x] = space; - } - sdp->sd_max_jheight = x; - gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); - - return 0; -} - -/** - * gfs2_jindex_hold - Grab a lock on the jindex - * @sdp: The GFS2 superblock - * @ji_gh: the holder for the jindex glock - * - * This is very similar to the gfs2_rindex_hold() function, except that - * in general we hold the jindex lock for longer periods of time and - * we grab it far less frequently (in general) then the rgrp lock. - * - * Returns: errno - */ - -int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) -{ - struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex); - struct qstr name; - char buf[20]; - struct gfs2_jdesc *jd; - int error; - - name.name = buf; - - mutex_lock(&sdp->sd_jindex_mutex); - - for (;;) { - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh); - if (error) + case Opt_locktable: + match_strlcpy(args->ar_locktable, &tmp[0], + GFS2_LOCKNAME_LEN); break; - - name.len = sprintf(buf, "journal%u", sdp->sd_journals); - name.hash = gfs2_disk_hash(name.name, name.len); - - error = gfs2_dir_check(sdp->sd_jindex, &name, NULL); - if (error == -ENOENT) { - error = 0; + case Opt_hostdata: + match_strlcpy(args->ar_hostdata, &tmp[0], + GFS2_LOCKNAME_LEN); break; - } - - gfs2_glock_dq_uninit(ji_gh); - - if (error) + case Opt_spectator: + args->ar_spectator = 1; break; - - error = -ENOMEM; - jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL); - if (!jd) + case Opt_ignore_local_fs: + /* Retained for backwards compat only */ break; - - INIT_LIST_HEAD(&jd->extent_list); - jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL); - if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { - if (!jd->jd_inode) - error = -ENOENT; - else - error = PTR_ERR(jd->jd_inode); - kfree(jd); + case Opt_localflocks: + args->ar_localflocks = 1; + break; + case Opt_localcaching: + /* Retained for backwards compat only */ + break; + case Opt_debug: + if (args->ar_errors == GFS2_ERRORS_PANIC) { + pr_warn("-o debug and -o errors=panic are mutually exclusive\n"); + return -EINVAL; + } + args->ar_debug = 1; + break; + case Opt_nodebug: + args->ar_debug = 0; + break; + case Opt_upgrade: + /* Retained for backwards compat only */ + break; + case Opt_acl: + args->ar_posix_acl = 1; + break; + case Opt_noacl: + args->ar_posix_acl = 0; + break; + case Opt_quota_off: + case Opt_noquota: + args->ar_quota = GFS2_QUOTA_OFF; + break; + case Opt_quota_account: + args->ar_quota = GFS2_QUOTA_ACCOUNT; + break; + case Opt_quota_on: + case Opt_quota: + args->ar_quota = GFS2_QUOTA_ON; break; + case Opt_suiddir: + args->ar_suiddir = 1; + break; + case Opt_nosuiddir: + args->ar_suiddir = 0; + break; + case Opt_data_writeback: + args->ar_data = GFS2_DATA_WRITEBACK; + break; + case Opt_data_ordered: + args->ar_data = GFS2_DATA_ORDERED; + break; + case Opt_meta: + args->ar_meta = 1; + break; + case Opt_discard: + args->ar_discard = 1; + break; + case Opt_nodiscard: + args->ar_discard = 0; + break; + case Opt_commit: + rv = match_int(&tmp[0], &args->ar_commit); + if (rv || args->ar_commit <= 0) { + pr_warn("commit mount option requires a positive numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_statfs_quantum: + rv = match_int(&tmp[0], &args->ar_statfs_quantum); + if (rv || args->ar_statfs_quantum < 0) { + pr_warn("statfs_quantum mount option requires a non-negative numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_quota_quantum: + rv = match_int(&tmp[0], &args->ar_quota_quantum); + if (rv || args->ar_quota_quantum <= 0) { + pr_warn("quota_quantum mount option requires a positive numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_statfs_percent: + rv = match_int(&tmp[0], &args->ar_statfs_percent); + if (rv || args->ar_statfs_percent < 0 || + args->ar_statfs_percent > 100) { + pr_warn("statfs_percent mount option requires a numeric argument between 0 and 100\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_err_withdraw: + args->ar_errors = GFS2_ERRORS_WITHDRAW; + break; + case Opt_err_panic: + if (args->ar_debug) { + pr_warn("-o debug and -o errors=panic are mutually exclusive\n"); + return -EINVAL; + } + args->ar_errors = GFS2_ERRORS_PANIC; + break; + case Opt_barrier: + args->ar_nobarrier = 0; + break; + case Opt_nobarrier: + args->ar_nobarrier = 1; + break; + case Opt_rgrplvb: + args->ar_rgrplvb = 1; + break; + case Opt_norgrplvb: + args->ar_rgrplvb = 0; + break; + case Opt_error: + default: + pr_warn("invalid mount option: %s\n", o); + return -EINVAL; } - - spin_lock(&sdp->sd_jindex_spin); - jd->jd_jid = sdp->sd_journals++; - list_add_tail(&jd->jd_list, &sdp->sd_jindex_list); - spin_unlock(&sdp->sd_jindex_spin); } - mutex_unlock(&sdp->sd_jindex_mutex); - - return error; + return 0; } /** @@ -417,9 +295,8 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) void gfs2_jindex_free(struct gfs2_sbd *sdp) { - struct list_head list, *head; + struct list_head list; struct gfs2_jdesc *jd; - struct gfs2_journal_extent *jext; spin_lock(&sdp->sd_jindex_spin); list_add(&list, &sdp->sd_jindex_list); @@ -429,14 +306,7 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp) while (!list_empty(&list)) { jd = list_entry(list.next, struct gfs2_jdesc, jd_list); - head = &jd->extent_list; - while (!list_empty(head)) { - jext = list_entry(head->next, - struct gfs2_journal_extent, - extent_list); - list_del(&jext->extent_list); - kfree(jext); - } + gfs2_free_journal_extents(jd); list_del(&jd->jd_list); iput(jd->jd_inode); kfree(jd); @@ -472,59 +342,49 @@ struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid) return jd; } -void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid) +int gfs2_jdesc_check(struct gfs2_jdesc *jd) { - struct gfs2_jdesc *jd; - - spin_lock(&sdp->sd_jindex_spin); - jd = jdesc_find_i(&sdp->sd_jindex_list, jid); - if (jd) - jd->jd_dirty = 1; - spin_unlock(&sdp->sd_jindex_spin); -} + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + u64 size = i_size_read(jd->jd_inode); -struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp) -{ - struct gfs2_jdesc *jd; - int found = 0; + if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30)) + return -EIO; - spin_lock(&sdp->sd_jindex_spin); + jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift; - list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { - if (jd->jd_dirty) { - jd->jd_dirty = 0; - found = 1; - break; - } + if (gfs2_write_alloc_required(ip, 0, size)) { + gfs2_consist_inode(ip); + return -EIO; } - spin_unlock(&sdp->sd_jindex_spin); - - if (!found) - jd = NULL; - return jd; + return 0; } -int gfs2_jdesc_check(struct gfs2_jdesc *jd) +static int init_threads(struct gfs2_sbd *sdp) { - struct gfs2_inode *ip = GFS2_I(jd->jd_inode); - struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - int ar; - int error; + struct task_struct *p; + int error = 0; - if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) || - (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) { - gfs2_consist_inode(ip); - return -EIO; + p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start logd thread: %d\n", error); + return error; } - jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; + sdp->sd_logd_process = p; - error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar); - if (!error && ar) { - gfs2_consist_inode(ip); - error = -EIO; + p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start quotad thread: %d\n", error); + goto fail; } + sdp->sd_quotad_process = p; + return 0; +fail: + kthread_stop(sdp->sd_logd_process); return error; } @@ -539,14 +399,19 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); struct gfs2_glock *j_gl = ip->i_gl; - struct gfs2_holder t_gh; + struct gfs2_holder thaw_gh; struct gfs2_log_header_host head; int error; - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); + error = init_threads(sdp); if (error) return error; + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0, + &thaw_gh); + if (error) + goto fail_threads; + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); error = gfs2_find_jhead(sdp->sd_jdesc, &head); @@ -569,51 +434,20 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - gfs2_glock_dq_uninit(&t_gh); + gfs2_glock_dq_uninit(&thaw_gh); return 0; fail: - t_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_uninit(&t_gh); - + thaw_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&thaw_gh); +fail_threads: + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); return error; } -/** - * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one - * @sdp: the filesystem - * - * Returns: errno - */ - -int gfs2_make_fs_ro(struct gfs2_sbd *sdp) -{ - struct gfs2_holder t_gh; - int error; - - gfs2_quota_sync(sdp); - gfs2_statfs_sync(sdp); - - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, - &t_gh); - if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return error; - - gfs2_meta_syncfs(sdp); - gfs2_log_shutdown(sdp); - - clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - - if (t_gh.gh_gl) - gfs2_glock_dq_uninit(&t_gh); - - gfs2_quota_cleanup(sdp); - - return error; -} - -static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) +void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) { const struct gfs2_statfs_change *str = buf; @@ -682,27 +516,62 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, { struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct buffer_head *l_bh; + s64 x, y; + int need_sync = 0; int error; error = gfs2_meta_inode_buffer(l_ip, &l_bh); if (error) return; - gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + gfs2_trans_add_meta(l_ip->i_gl, l_bh); spin_lock(&sdp->sd_statfs_spin); l_sc->sc_total += total; l_sc->sc_free += free; l_sc->sc_dinodes += dinodes; gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode)); + if (sdp->sd_args.ar_statfs_percent) { + x = 100 * l_sc->sc_free; + y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent; + if (x >= y || x <= -y) + need_sync = 1; + } spin_unlock(&sdp->sd_statfs_spin); brelse(l_bh); + if (need_sync) + gfs2_wake_up_statfs(sdp); +} + +void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, + struct buffer_head *l_bh) +{ + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + gfs2_trans_add_meta(l_ip->i_gl, l_bh); + + spin_lock(&sdp->sd_statfs_spin); + m_sc->sc_total += l_sc->sc_total; + m_sc->sc_free += l_sc->sc_free; + m_sc->sc_dinodes += l_sc->sc_dinodes; + memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); + memset(l_bh->b_data + sizeof(struct gfs2_dinode), + 0, sizeof(struct gfs2_statfs_change)); + spin_unlock(&sdp->sd_statfs_spin); + + gfs2_trans_add_meta(m_ip->i_gl, m_bh); + gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); } -int gfs2_statfs_sync(struct gfs2_sbd *sdp) +int gfs2_statfs_sync(struct super_block *sb, int type) { + struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; @@ -737,19 +606,8 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp) if (error) goto out_bh2; - gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); - - spin_lock(&sdp->sd_statfs_spin); - m_sc->sc_total += l_sc->sc_total; - m_sc->sc_free += l_sc->sc_free; - m_sc->sc_dinodes += l_sc->sc_dinodes; - memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); - memset(l_bh->b_data + sizeof(struct gfs2_dinode), - 0, sizeof(struct gfs2_statfs_change)); - spin_unlock(&sdp->sd_statfs_spin); - - gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); - gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); + update_statfs(sdp, m_bh, l_bh); + sdp->sd_statfs_force_sync = 0; gfs2_trans_end(sdp); @@ -762,35 +620,389 @@ out: return error; } +struct lfcc { + struct list_head list; + struct gfs2_holder gh; +}; + /** - * gfs2_statfs_i - Do a statfs + * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all + * journals are clean + * @sdp: the file system + * @state: the state to put the transaction lock into + * @t_gh: the hold on the transaction lock + * + * Returns: errno + */ + +static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, + struct gfs2_holder *freeze_gh) +{ + struct gfs2_inode *ip; + struct gfs2_jdesc *jd; + struct lfcc *lfcc; + LIST_HEAD(list); + struct gfs2_log_header_host lh; + struct gfs2_inode *dip = GFS2_I(sdp->sd_root_dir->d_inode); + int error; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, + &sdp->sd_freeze_root_gh); + if (error) + return error; + atomic_set(&sdp->sd_frozen_root, 1); + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); + if (!lfcc) { + error = -ENOMEM; + goto out; + } + ip = GFS2_I(jd->jd_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh); + if (error) { + kfree(lfcc); + goto out; + } + list_add(&lfcc->list, &list); + } + + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, freeze_gh); + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + error = gfs2_jdesc_check(jd); + if (error) + break; + error = gfs2_find_jhead(jd, &lh); + if (error) + break; + if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + error = -EBUSY; + break; + } + } + + if (error) + gfs2_glock_dq_uninit(freeze_gh); + +out: + while (!list_empty(&list)) { + lfcc = list_entry(list.next, struct lfcc, list); + list_del(&lfcc->list); + gfs2_glock_dq_uninit(&lfcc->gh); + kfree(lfcc); + } + if (error) { + atomic_dec(&sdp->sd_frozen_root); + wait_event(sdp->sd_frozen_root_wait, atomic_read(&sdp->sd_frozen_root) == 0); + gfs2_glock_dq_uninit(&sdp->sd_freeze_root_gh); + } + return error; +} + +void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) +{ + struct gfs2_dinode *str = buf; + + str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI); + str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); + str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); + str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); + str->di_mode = cpu_to_be32(ip->i_inode.i_mode); + str->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode)); + str->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode)); + str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); + str->di_size = cpu_to_be64(i_size_read(&ip->i_inode)); + str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); + str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); + str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); + str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec); + + str->di_goal_meta = cpu_to_be64(ip->i_goal); + str->di_goal_data = cpu_to_be64(ip->i_goal); + str->di_generation = cpu_to_be64(ip->i_generation); + + str->di_flags = cpu_to_be32(ip->i_diskflags); + str->di_height = cpu_to_be16(ip->i_height); + str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && + !(ip->i_diskflags & GFS2_DIF_EXHASH) ? + GFS2_FORMAT_DE : 0); + str->di_depth = cpu_to_be16(ip->i_depth); + str->di_entries = cpu_to_be32(ip->i_entries); + + str->di_eattr = cpu_to_be64(ip->i_eattr); + str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); + str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); + str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); +} + +/** + * gfs2_write_inode - Make sure the inode is stable on the disk + * @inode: The inode + * @wbc: The writeback control structure + * + * Returns: errno + */ + +static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); + struct backing_dev_info *bdi = metamapping->backing_dev_info; + int ret = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH); + if (bdi->dirty_exceeded) + gfs2_ail1_flush(sdp, wbc); + else + filemap_fdatawrite(metamapping); + if (wbc->sync_mode == WB_SYNC_ALL) + ret = filemap_fdatawait(metamapping); + if (ret) + mark_inode_dirty_sync(inode); + return ret; +} + +/** + * gfs2_dirty_inode - check for atime updates + * @inode: The inode in question + * @flags: The type of dirty + * + * Unfortunately it can be called under any combination of inode + * glock and transaction lock, so we have to check carefully. + * + * At the moment this deals only with atime - it should be possible + * to expand that role in future, once a review of the locking has + * been carried out. + */ + +static void gfs2_dirty_inode(struct inode *inode, int flags) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_holder gh; + int need_unlock = 0; + int need_endtrans = 0; + int ret; + + if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC))) + return; + + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) { + fs_err(sdp, "dirty_inode: glock %d\n", ret); + return; + } + need_unlock = 1; + } else if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE)) + return; + + if (current->journal_info == NULL) { + ret = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (ret) { + fs_err(sdp, "dirty_inode: gfs2_trans_begin %d\n", ret); + goto out; + } + need_endtrans = 1; + } + + ret = gfs2_meta_inode_buffer(ip, &bh); + if (ret == 0) { + gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + } + + if (need_endtrans) + gfs2_trans_end(sdp); +out: + if (need_unlock) + gfs2_glock_dq_uninit(&gh); +} + +/** + * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one * @sdp: the filesystem - * @sg: the sg structure * * Returns: errno */ -int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) { - struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; - struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct gfs2_holder thaw_gh; + int error; - spin_lock(&sdp->sd_statfs_spin); + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, GL_NOCACHE, + &thaw_gh); + if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return error; - *sc = *m_sc; - sc->sc_total += l_sc->sc_total; - sc->sc_free += l_sc->sc_free; - sc->sc_dinodes += l_sc->sc_dinodes; + down_write(&sdp->sd_log_flush_lock); + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + up_write(&sdp->sd_log_flush_lock); - spin_unlock(&sdp->sd_statfs_spin); + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); - if (sc->sc_free < 0) - sc->sc_free = 0; - if (sc->sc_free > sc->sc_total) - sc->sc_free = sc->sc_total; - if (sc->sc_dinodes < 0) - sc->sc_dinodes = 0; + flush_workqueue(gfs2_delete_workqueue); + gfs2_quota_sync(sdp->sd_vfs, 0); + gfs2_statfs_sync(sdp->sd_vfs, 0); + + gfs2_log_flush(sdp, NULL, SHUTDOWN_FLUSH); + gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); + + if (thaw_gh.gh_gl) + gfs2_glock_dq_uninit(&thaw_gh); + + gfs2_quota_cleanup(sdp); + + return error; +} + +static int gfs2_umount_recovery_wait(void *word) +{ + schedule(); + return 0; +} + +/** + * gfs2_put_super - Unmount the filesystem + * @sb: The VFS superblock + * + */ + +static void gfs2_put_super(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + struct gfs2_jdesc *jd; + + /* No more recovery requests */ + set_bit(SDF_NORECOVERY, &sdp->sd_flags); + smp_mb(); + + /* Wait on outstanding recovery */ +restart: + spin_lock(&sdp->sd_jindex_spin); + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (!test_bit(JDF_RECOVERY, &jd->jd_flags)) + continue; + spin_unlock(&sdp->sd_jindex_spin); + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, + gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); + goto restart; + } + spin_unlock(&sdp->sd_jindex_spin); + + if (!(sb->s_flags & MS_RDONLY)) { + error = gfs2_make_fs_ro(sdp); + if (error) + gfs2_io_error(sdp); + } + /* At this point, we're through modifying the disk */ + + /* Release stuff */ + + iput(sdp->sd_jindex); + iput(sdp->sd_statfs_inode); + iput(sdp->sd_rindex); + iput(sdp->sd_quota_inode); + + gfs2_glock_put(sdp->sd_rename_gl); + gfs2_glock_put(sdp->sd_freeze_gl); + + if (!sdp->sd_args.ar_spectator) { + gfs2_glock_dq_uninit(&sdp->sd_journal_gh); + gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); + gfs2_glock_dq_uninit(&sdp->sd_qc_gh); + iput(sdp->sd_sc_inode); + iput(sdp->sd_qc_inode); + } + + gfs2_glock_dq_uninit(&sdp->sd_live_gh); + gfs2_clear_rgrpd(sdp); + gfs2_jindex_free(sdp); + /* Take apart glock structures and buffer lists */ + gfs2_gl_hash_clear(sdp); + /* Unmount the locking protocol */ + gfs2_lm_unmount(sdp); + + /* At this point, we're through participating in the lockspace */ + gfs2_sys_fs_del(sdp); +} + +/** + * gfs2_sync_fs - sync the filesystem + * @sb: the superblock + * + * Flushes the log to disk. + */ + +static int gfs2_sync_fs(struct super_block *sb, int wait) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + + gfs2_quota_sync(sb, -1); + if (wait && sdp && !atomic_read(&sdp->sd_log_freeze)) + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + return 0; +} +/** + * gfs2_freeze - prevent further writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static int gfs2_freeze(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + + if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return -EINVAL; + + for (;;) { + error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); + if (!error) + break; + + switch (error) { + case -EBUSY: + fs_err(sdp, "waiting for recovery before freeze\n"); + break; + + default: + fs_err(sdp, "error freezing FS: %d\n", error); + break; + } + + fs_err(sdp, "retrying...\n"); + msleep(1000); + } + return 0; +} + +/** + * gfs2_unfreeze - reallow writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static int gfs2_unfreeze(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + atomic_dec(&sdp->sd_frozen_root); + wait_event(sdp->sd_frozen_root_wait, atomic_read(&sdp->sd_frozen_root) == 0); + gfs2_glock_dq_uninit(&sdp->sd_freeze_root_gh); return 0; } @@ -807,8 +1019,8 @@ static int statfs_slow_fill(struct gfs2_rgrpd *rgd, { gfs2_rgrp_verify(rgd); sc->sc_total += rgd->rd_data; - sc->sc_free += rgd->rd_rg.rg_free; - sc->sc_dinodes += rgd->rd_rg.rg_dinodes; + sc->sc_free += rgd->rd_free; + sc->sc_dinodes += rgd->rd_dinodes; return 0; } @@ -825,9 +1037,8 @@ static int statfs_slow_fill(struct gfs2_rgrpd *rgd, * Returns: errno */ -int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) { - struct gfs2_holder ri_gh; struct gfs2_rgrpd *rgd_next; struct gfs2_holder *gha, *gh; unsigned int slots = 64; @@ -840,10 +1051,6 @@ int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) if (!gha) return -ENOMEM; - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto out; - rgd_next = gfs2_rgrpd_get_first(sdp); for (;;) { @@ -886,134 +1093,542 @@ int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) yield(); } - gfs2_glock_dq_uninit(&ri_gh); - -out: kfree(gha); return error; } -struct lfcc { - struct list_head list; - struct gfs2_holder gh; -}; +/** + * gfs2_statfs_i - Do a statfs + * @sdp: the filesystem + * @sg: the sg structure + * + * Returns: errno + */ + +static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + spin_lock(&sdp->sd_statfs_spin); + + *sc = *m_sc; + sc->sc_total += l_sc->sc_total; + sc->sc_free += l_sc->sc_free; + sc->sc_dinodes += l_sc->sc_dinodes; + + spin_unlock(&sdp->sd_statfs_spin); + + if (sc->sc_free < 0) + sc->sc_free = 0; + if (sc->sc_free > sc->sc_total) + sc->sc_free = sc->sc_total; + if (sc->sc_dinodes < 0) + sc->sc_dinodes = 0; + + return 0; +} /** - * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all - * journals are clean - * @sdp: the file system - * @state: the state to put the transaction lock into - * @t_gh: the hold on the transaction lock + * gfs2_statfs - Gather and return stats about the filesystem + * @sb: The superblock + * @statfsbuf: The buffer + * + * Returns: 0 on success or error code + */ + +static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_statfs_change_host sc; + int error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + if (gfs2_tune_get(sdp, gt_statfs_slow)) + error = gfs2_statfs_slow(sdp, &sc); + else + error = gfs2_statfs_i(sdp, &sc); + + if (error) + return error; + + buf->f_type = GFS2_MAGIC; + buf->f_bsize = sdp->sd_sb.sb_bsize; + buf->f_blocks = sc.sc_total; + buf->f_bfree = sc.sc_free; + buf->f_bavail = sc.sc_free; + buf->f_files = sc.sc_dinodes + sc.sc_free; + buf->f_ffree = sc.sc_free; + buf->f_namelen = GFS2_FNAMESIZE; + + return 0; +} + +/** + * gfs2_remount_fs - called when the FS is remounted + * @sb: the filesystem + * @flags: the remount flags + * @data: extra data passed in (not used right now) * * Returns: errno */ -static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, - struct gfs2_holder *t_gh) +static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) { - struct gfs2_inode *ip; - struct gfs2_holder ji_gh; - struct gfs2_jdesc *jd; - struct lfcc *lfcc; - LIST_HEAD(list); - struct gfs2_log_header_host lh; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_args args = sdp->sd_args; /* Default to current settings */ + struct gfs2_tune *gt = &sdp->sd_tune; int error; - error = gfs2_jindex_hold(sdp, &ji_gh); + sync_filesystem(sb); + + spin_lock(>->gt_spin); + args.ar_commit = gt->gt_logd_secs; + args.ar_quota_quantum = gt->gt_quota_quantum; + if (gt->gt_statfs_slow) + args.ar_statfs_quantum = 0; + else + args.ar_statfs_quantum = gt->gt_statfs_quantum; + spin_unlock(>->gt_spin); + error = gfs2_mount_args(&args, data); if (error) return error; - list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { - lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); - if (!lfcc) { - error = -ENOMEM; - goto out; - } - ip = GFS2_I(jd->jd_inode); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh); - if (error) { - kfree(lfcc); - goto out; - } - list_add(&lfcc->list, &list); - } + /* Not allowed to change locking details */ + if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) || + strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) || + strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata)) + return -EINVAL; - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED, - LM_FLAG_PRIORITY | GL_NOCACHE, - t_gh); + /* Some flags must not be changed */ + if (args_neq(&args, &sdp->sd_args, spectator) || + args_neq(&args, &sdp->sd_args, localflocks) || + args_neq(&args, &sdp->sd_args, meta)) + return -EINVAL; - list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { - error = gfs2_jdesc_check(jd); + if (sdp->sd_args.ar_spectator) + *flags |= MS_RDONLY; + + if ((sb->s_flags ^ *flags) & MS_RDONLY) { + if (*flags & MS_RDONLY) + error = gfs2_make_fs_ro(sdp); + else + error = gfs2_make_fs_rw(sdp); if (error) + return error; + } + + sdp->sd_args = args; + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= MS_POSIXACL; + else + sb->s_flags &= ~MS_POSIXACL; + if (sdp->sd_args.ar_nobarrier) + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + else + clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); + spin_lock(>->gt_spin); + gt->gt_logd_secs = args.ar_commit; + gt->gt_quota_quantum = args.ar_quota_quantum; + if (args.ar_statfs_quantum) { + gt->gt_statfs_slow = 0; + gt->gt_statfs_quantum = args.ar_statfs_quantum; + } + else { + gt->gt_statfs_slow = 1; + gt->gt_statfs_quantum = 30; + } + spin_unlock(>->gt_spin); + + gfs2_online_uevent(sdp); + return 0; +} + +/** + * gfs2_drop_inode - Drop an inode (test for remote unlink) + * @inode: The inode to drop + * + * If we've received a callback on an iopen lock then its because a + * remote node tried to deallocate the inode but failed due to this node + * still having the inode open. Here we mark the link count zero + * since we know that it must have reached zero if the GLF_DEMOTE flag + * is set on the iopen glock. If we didn't do a disk read since the + * remote node removed the final link then we might otherwise miss + * this event. This check ensures that this node will deallocate the + * inode's blocks, or alternatively pass the baton on to another + * node for later deallocation. + */ + +static int gfs2_drop_inode(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) && inode->i_nlink) { + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) + clear_nlink(inode); + } + return generic_drop_inode(inode); +} + +static int is_ancestor(const struct dentry *d1, const struct dentry *d2) +{ + do { + if (d1 == d2) + return 1; + d1 = d1->d_parent; + } while (!IS_ROOT(d1)); + return 0; +} + +/** + * gfs2_show_options - Show mount options for /proc/mounts + * @s: seq_file structure + * @root: root of this (sub)tree + * + * Returns: 0 on success or error code + */ + +static int gfs2_show_options(struct seq_file *s, struct dentry *root) +{ + struct gfs2_sbd *sdp = root->d_sb->s_fs_info; + struct gfs2_args *args = &sdp->sd_args; + int val; + + if (is_ancestor(root, sdp->sd_master_dir)) + seq_printf(s, ",meta"); + if (args->ar_lockproto[0]) + seq_printf(s, ",lockproto=%s", args->ar_lockproto); + if (args->ar_locktable[0]) + seq_printf(s, ",locktable=%s", args->ar_locktable); + if (args->ar_hostdata[0]) + seq_printf(s, ",hostdata=%s", args->ar_hostdata); + if (args->ar_spectator) + seq_printf(s, ",spectator"); + if (args->ar_localflocks) + seq_printf(s, ",localflocks"); + if (args->ar_debug) + seq_printf(s, ",debug"); + if (args->ar_posix_acl) + seq_printf(s, ",acl"); + if (args->ar_quota != GFS2_QUOTA_DEFAULT) { + char *state; + switch (args->ar_quota) { + case GFS2_QUOTA_OFF: + state = "off"; break; - error = gfs2_find_jhead(jd, &lh); - if (error) + case GFS2_QUOTA_ACCOUNT: + state = "account"; break; - if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { - error = -EBUSY; + case GFS2_QUOTA_ON: + state = "on"; + break; + default: + state = "unknown"; break; } + seq_printf(s, ",quota=%s", state); + } + if (args->ar_suiddir) + seq_printf(s, ",suiddir"); + if (args->ar_data != GFS2_DATA_DEFAULT) { + char *state; + switch (args->ar_data) { + case GFS2_DATA_WRITEBACK: + state = "writeback"; + break; + case GFS2_DATA_ORDERED: + state = "ordered"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",data=%s", state); + } + if (args->ar_discard) + seq_printf(s, ",discard"); + val = sdp->sd_tune.gt_logd_secs; + if (val != 30) + seq_printf(s, ",commit=%d", val); + val = sdp->sd_tune.gt_statfs_quantum; + if (val != 30) + seq_printf(s, ",statfs_quantum=%d", val); + else if (sdp->sd_tune.gt_statfs_slow) + seq_puts(s, ",statfs_quantum=0"); + val = sdp->sd_tune.gt_quota_quantum; + if (val != 60) + seq_printf(s, ",quota_quantum=%d", val); + if (args->ar_statfs_percent) + seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent); + if (args->ar_errors != GFS2_ERRORS_DEFAULT) { + const char *state; + + switch (args->ar_errors) { + case GFS2_ERRORS_WITHDRAW: + state = "withdraw"; + break; + case GFS2_ERRORS_PANIC: + state = "panic"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",errors=%s", state); + } + if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) + seq_printf(s, ",nobarrier"); + if (test_bit(SDF_DEMOTE, &sdp->sd_flags)) + seq_printf(s, ",demote_interface_used"); + if (args->ar_rgrplvb) + seq_printf(s, ",rgrplvb"); + return 0; +} + +static void gfs2_final_release_pages(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + struct gfs2_glock *gl = ip->i_gl; + + truncate_inode_pages(gfs2_glock2aspace(ip->i_gl), 0); + truncate_inode_pages(&inode->i_data, 0); + + if (atomic_read(&gl->gl_revokes) == 0) { + clear_bit(GLF_LFLUSH, &gl->gl_flags); + clear_bit(GLF_DIRTY, &gl->gl_flags); + } +} + +static int gfs2_dinode_dealloc(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + struct gfs2_holder gh; + int error; + + if (gfs2_get_inode_blocks(&ip->i_inode) != 1) { + gfs2_consist_inode(ip); + return -EIO; } + error = gfs2_rindex_update(sdp); if (error) - gfs2_glock_dq_uninit(t_gh); + return error; -out: - while (!list_empty(&list)) { - lfcc = list_entry(list.next, struct lfcc, list); - list_del(&lfcc->list); - gfs2_glock_dq_uninit(&lfcc->gh); - kfree(lfcc); + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); + if (error) + return error; + + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1); + if (!rgd) { + gfs2_consist_inode(ip); + error = -EIO; + goto out_qs; } - gfs2_glock_dq_uninit(&ji_gh); + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (error) + goto out_qs; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, + sdp->sd_jdesc->jd_blocks); + if (error) + goto out_rg_gunlock; + + gfs2_free_di(rgd, ip); + + gfs2_final_release_pages(ip); + + gfs2_trans_end(sdp); + +out_rg_gunlock: + gfs2_glock_dq_uninit(&gh); +out_qs: + gfs2_quota_unhold(ip); return error; } /** - * gfs2_freeze_fs - freezes the file system - * @sdp: the file system + * gfs2_evict_inode - Remove an inode from cache + * @inode: The inode to evict * - * This function flushes data and meta data for all machines by - * aquiring the transaction log exclusively. All journals are - * ensured to be in a clean state as well. + * There are three cases to consider: + * 1. i_nlink == 0, we are final opener (and must deallocate) + * 2. i_nlink == 0, we are not the final opener (and cannot deallocate) + * 3. i_nlink > 0 * - * Returns: errno + * If the fs is read only, then we have to treat all cases as per #3 + * since we are unable to do any deallocation. The inode will be + * deallocated by the next read/write node to attempt an allocation + * in the same resource group + * + * We have to (at the moment) hold the inodes main lock to cover + * the gap between unlocking the shared lock on the iopen lock and + * taking the exclusive lock. I'd rather do a shared -> exclusive + * conversion on the iopen lock, but we can change that later. This + * is safe, just less efficient. */ -int gfs2_freeze_fs(struct gfs2_sbd *sdp) +static void gfs2_evict_inode(struct inode *inode) { - int error = 0; + struct super_block *sb = inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + + if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) { + clear_inode(inode); + return; + } - mutex_lock(&sdp->sd_freeze_lock); + if (inode->i_nlink || (sb->s_flags & MS_RDONLY)) + goto out; - if (!sdp->sd_freeze_count++) { - error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); + /* Must not read inode block until block type has been verified */ + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); + if (unlikely(error)) { + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + goto out; + } + + if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { + error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); if (error) - sdp->sd_freeze_count--; + goto out_truncate; } - mutex_unlock(&sdp->sd_freeze_lock); + if (test_bit(GIF_INVALID, &ip->i_flags)) { + error = gfs2_inode_refresh(ip); + if (error) + goto out_truncate; + } - return error; -} + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_wait(&ip->i_iopen_gh); + gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); + error = gfs2_glock_nq(&ip->i_iopen_gh); + if (error) + goto out_truncate; -/** - * gfs2_unfreeze_fs - unfreezes the file system - * @sdp: the file system - * - * This function allows the file system to proceed by unlocking - * the exclusively held transaction lock. Other GFS2 nodes are - * now free to acquire the lock shared and go on with their lives. - * - */ + /* Case 1 starts here */ + + if (S_ISDIR(inode->i_mode) && + (ip->i_diskflags & GFS2_DIF_EXHASH)) { + error = gfs2_dir_exhash_dealloc(ip); + if (error) + goto out_unlock; + } + + if (ip->i_eattr) { + error = gfs2_ea_dealloc(ip); + if (error) + goto out_unlock; + } + + if (!gfs2_is_stuffed(ip)) { + error = gfs2_file_dealloc(ip); + if (error) + goto out_unlock; + } + + error = gfs2_dinode_dealloc(ip); + goto out_unlock; -void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) +out_truncate: + gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); + if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) { + struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); + filemap_fdatawrite(metamapping); + filemap_fdatawait(metamapping); + } + write_inode_now(inode, 1); + gfs2_ail_flush(ip->i_gl, 0); + + /* Case 2 starts here */ + error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); + if (error) + goto out_unlock; + /* Needs to be done before glock release & also in a transaction */ + truncate_inode_pages(&inode->i_data, 0); + gfs2_trans_end(sdp); + +out_unlock: + /* Error path for case 1 */ + if (gfs2_rs_active(ip->i_res)) + gfs2_rs_deltree(ip->i_res); + + if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq(&ip->i_iopen_gh); + } + gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&gh); + if (error && error != GLR_TRYFAILED && error != -EROFS) + fs_warn(sdp, "gfs2_evict_inode: %d\n", error); +out: + /* Case 3 starts here */ + truncate_inode_pages_final(&inode->i_data); + gfs2_rs_delete(ip, NULL); + gfs2_ordered_del_inode(ip); + clear_inode(inode); + gfs2_dir_hash_inval(ip); + ip->i_gl->gl_object = NULL; + flush_delayed_work(&ip->i_gl->gl_work); + gfs2_glock_add_to_lru(ip->i_gl); + gfs2_glock_put(ip->i_gl); + ip->i_gl = NULL; + if (ip->i_iopen_gh.gh_gl) { + ip->i_iopen_gh.gh_gl->gl_object = NULL; + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + } +} + +static struct inode *gfs2_alloc_inode(struct super_block *sb) { - mutex_lock(&sdp->sd_freeze_lock); + struct gfs2_inode *ip; + + ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); + if (ip) { + ip->i_flags = 0; + ip->i_gl = NULL; + ip->i_rgd = NULL; + ip->i_res = NULL; + } + return &ip->i_inode; +} - if (sdp->sd_freeze_count && !--sdp->sd_freeze_count) - gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); +static void gfs2_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(gfs2_inode_cachep, inode); +} - mutex_unlock(&sdp->sd_freeze_lock); +static void gfs2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, gfs2_i_callback); } +const struct super_operations gfs2_super_ops = { + .alloc_inode = gfs2_alloc_inode, + .destroy_inode = gfs2_destroy_inode, + .write_inode = gfs2_write_inode, + .dirty_inode = gfs2_dirty_inode, + .evict_inode = gfs2_evict_inode, + .put_super = gfs2_put_super, + .sync_fs = gfs2_sync_fs, + .freeze_fs = gfs2_freeze, + .unfreeze_fs = gfs2_unfreeze, + .statfs = gfs2_statfs, + .remount_fs = gfs2_remount_fs, + .drop_inode = gfs2_drop_inode, + .show_options = gfs2_show_options, +}; + diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 60a870e430b..90e3322ffa1 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -10,13 +10,11 @@ #ifndef __SUPER_DOT_H__ #define __SUPER_DOT_H__ +#include <linux/fs.h> +#include <linux/dcache.h> #include "incore.h" -void gfs2_tune_init(struct gfs2_tune *gt); - -int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent); -int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent); -int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector); +extern void gfs2_lm_unmount(struct gfs2_sbd *sdp); static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) { @@ -27,29 +25,33 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) return x; } -int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh); -void gfs2_jindex_free(struct gfs2_sbd *sdp); - -struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); -void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid); -struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp); -int gfs2_jdesc_check(struct gfs2_jdesc *jd); - -int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, - struct gfs2_inode **ipp); - -int gfs2_make_fs_rw(struct gfs2_sbd *sdp); -int gfs2_make_fs_ro(struct gfs2_sbd *sdp); - -int gfs2_statfs_init(struct gfs2_sbd *sdp); -void gfs2_statfs_change(struct gfs2_sbd *sdp, - s64 total, s64 free, s64 dinodes); -int gfs2_statfs_sync(struct gfs2_sbd *sdp); -int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc); -int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc); - -int gfs2_freeze_fs(struct gfs2_sbd *sdp); -void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); +extern void gfs2_jindex_free(struct gfs2_sbd *sdp); + +extern int gfs2_mount_args(struct gfs2_args *args, char *data); + +extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); +extern int gfs2_jdesc_check(struct gfs2_jdesc *jd); + +extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, + struct gfs2_inode **ipp); + +extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); +extern void gfs2_online_uevent(struct gfs2_sbd *sdp); +extern int gfs2_statfs_init(struct gfs2_sbd *sdp); +extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, + s64 dinodes); +extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, + const void *buf); +extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, + struct buffer_head *l_bh); +extern int gfs2_statfs_sync(struct super_block *sb, int type); + +extern struct file_system_type gfs2_fs_type; +extern struct file_system_type gfs2meta_fs_type; +extern const struct export_operations gfs2_export_ops; +extern const struct super_operations gfs2_super_ops; +extern const struct dentry_operations gfs2_dops; +extern const struct xattr_handler *gfs2_xattr_handlers[]; #endif /* __SUPER_DOT_H__ */ diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index eaa3b7b2f99..3ab566ba569 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -7,28 +7,57 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> -#include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/module.h> #include <linux/kobject.h> -#include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> #include <asm/uaccess.h> +#include <linux/gfs2_ondisk.h> +#include <linux/genhd.h> #include "gfs2.h" #include "incore.h" -#include "lm.h" #include "sys.h" #include "super.h" #include "glock.h" #include "quota.h" #include "util.h" +#include "glops.h" +#include "recovery.h" -char *gfs2_sys_margs; -spinlock_t gfs2_sys_margs_lock; +struct gfs2_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); + ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); +}; + +static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->show ? a->show(sdp, buf) : 0; +} + +static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->store ? a->store(sdp, buf, len) : len; +} + +static const struct sysfs_ops gfs2_attr_ops = { + .show = gfs2_attr_show, + .store = gfs2_attr_store, +}; + + +static struct kset *gfs2_kset; static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) { @@ -41,41 +70,60 @@ static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf) return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname); } -static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) +static int gfs2_uuid_valid(const u8 *uuid) { - unsigned int count; + int i; - mutex_lock(&sdp->sd_freeze_lock); - count = sdp->sd_freeze_count; - mutex_unlock(&sdp->sd_freeze_lock); + for (i = 0; i < 16; i++) { + if (uuid[i]) + return 1; + } + return 0; +} + +static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf) +{ + struct super_block *s = sdp->sd_vfs; + const u8 *uuid = s->s_uuid; + buf[0] = '\0'; + if (!gfs2_uuid_valid(uuid)) + return 0; + return snprintf(buf, PAGE_SIZE, "%pUB\n", uuid); +} + +static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) +{ + struct super_block *sb = sdp->sd_vfs; + int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1; - return snprintf(buf, PAGE_SIZE, "%u\n", count); + return snprintf(buf, PAGE_SIZE, "%u\n", frozen); } static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { - ssize_t ret = len; - int error = 0; + int error; int n = simple_strtol(buf, NULL, 0); if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; switch (n) { case 0: - gfs2_unfreeze_fs(sdp); + error = thaw_super(sdp->sd_vfs); break; case 1: - error = gfs2_freeze_fs(sdp); + error = freeze_super(sdp->sd_vfs); break; default: - ret = -EINVAL; + return -EINVAL; } - if (error) + if (error) { fs_warn(sdp, "freeze %d error %d", n, error); + return error; + } - return ret; + return len; } static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) @@ -87,14 +135,13 @@ static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; if (simple_strtol(buf, NULL, 0) != 1) return -EINVAL; - gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: withdrawing from cluster at user's request\n", - sdp->sd_fsname); + gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n"); + return len; } @@ -102,24 +149,12 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; if (simple_strtol(buf, NULL, 0) != 1) return -EINVAL; - gfs2_statfs_sync(sdp); - return len; -} - -static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - - if (simple_strtol(buf, NULL, 0) != 1) - return -EINVAL; - - gfs2_gl_hash_clear(sdp, NO_WAIT); + gfs2_statfs_sync(sdp->sd_vfs, 0); return len; } @@ -127,217 +162,359 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; if (simple_strtol(buf, NULL, 0) != 1) return -EINVAL; - gfs2_quota_sync(sdp); + gfs2_quota_sync(sdp->sd_vfs, 0); return len; } static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { + struct kqid qid; + int error; u32 id; if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; id = simple_strtoul(buf, NULL, 0); - gfs2_quota_refresh(sdp, 1, id); - return len; + qid = make_kqid(current_user_ns(), USRQUOTA, id); + if (!qid_valid(qid)) + return -EINVAL; + + error = gfs2_quota_refresh(sdp, qid); + return error ? error : len; } static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { + struct kqid qid; + int error; u32 id; if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; id = simple_strtoul(buf, NULL, 0); - gfs2_quota_refresh(sdp, 0, id); + qid = make_kqid(current_user_ns(), GRPQUOTA, id); + if (!qid_valid(qid)) + return -EINVAL; + + error = gfs2_quota_refresh(sdp, qid); + return error ? error : len; +} + +static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + struct gfs2_glock *gl; + const struct gfs2_glock_operations *glops; + unsigned int glmode; + unsigned int gltype; + unsigned long long glnum; + char mode[16]; + int rv; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + rv = sscanf(buf, "%u:%llu %15s", &gltype, &glnum, + mode); + if (rv != 3) + return -EINVAL; + + if (strcmp(mode, "EX") == 0) + glmode = LM_ST_UNLOCKED; + else if ((strcmp(mode, "CW") == 0) || (strcmp(mode, "DF") == 0)) + glmode = LM_ST_DEFERRED; + else if ((strcmp(mode, "PR") == 0) || (strcmp(mode, "SH") == 0)) + glmode = LM_ST_SHARED; + else + return -EINVAL; + + if (gltype > LM_TYPE_JOURNAL) + return -EINVAL; + if (gltype == LM_TYPE_NONDISK && glnum == GFS2_FREEZE_LOCK) + glops = &gfs2_freeze_glops; + else + glops = gfs2_glops_list[gltype]; + if (glops == NULL) + return -EINVAL; + if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) + fs_info(sdp, "demote interface used\n"); + rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl); + if (rv) + return rv; + gfs2_glock_cb(gl, glmode); + gfs2_glock_put(gl); return len; } -struct gfs2_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); - ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); -}; #define GFS2_ATTR(name, mode, show, store) \ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) GFS2_ATTR(id, 0444, id_show, NULL); GFS2_ATTR(fsname, 0444, fsname_show, NULL); +GFS2_ATTR(uuid, 0444, uuid_show, NULL); GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); -GFS2_ATTR(shrink, 0200, NULL, shrink_store); GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store); GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store); +GFS2_ATTR(demote_rq, 0200, NULL, demote_rq_store); static struct attribute *gfs2_attrs[] = { &gfs2_attr_id.attr, &gfs2_attr_fsname.attr, + &gfs2_attr_uuid.attr, &gfs2_attr_freeze.attr, - &gfs2_attr_shrink.attr, &gfs2_attr_withdraw.attr, &gfs2_attr_statfs_sync.attr, &gfs2_attr_quota_sync.attr, &gfs2_attr_quota_refresh_user.attr, &gfs2_attr_quota_refresh_group.attr, + &gfs2_attr_demote_rq.attr, NULL, }; -static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static void gfs2_sbd_release(struct kobject *kobj) { struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); - struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); - return a->show ? a->show(sdp, buf) : 0; -} -static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); - struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); - return a->store ? a->store(sdp, buf, len) : len; + kfree(sdp); } -static struct sysfs_ops gfs2_attr_ops = { - .show = gfs2_attr_show, - .store = gfs2_attr_store, -}; - static struct kobj_type gfs2_ktype = { + .release = gfs2_sbd_release, .default_attrs = gfs2_attrs, .sysfs_ops = &gfs2_attr_ops, }; -static struct kset *gfs2_kset; /* - * display struct lm_lockstruct fields + * lock_module. Originally from lock_dlm */ -struct lockstruct_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); -}; +static ssize_t proto_name_show(struct gfs2_sbd *sdp, char *buf) +{ + const struct lm_lockops *ops = sdp->sd_lockstruct.ls_ops; + return sprintf(buf, "%s\n", ops->lm_proto_name); +} -#define LOCKSTRUCT_ATTR(name, fmt) \ -static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ -{ \ - return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \ -} \ -static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name) - -LOCKSTRUCT_ATTR(jid, "%u\n"); -LOCKSTRUCT_ATTR(first, "%u\n"); -LOCKSTRUCT_ATTR(lvb_size, "%u\n"); -LOCKSTRUCT_ATTR(flags, "%d\n"); - -static struct attribute *lockstruct_attrs[] = { - &lockstruct_attr_jid.attr, - &lockstruct_attr_first.attr, - &lockstruct_attr_lvb_size.attr, - &lockstruct_attr_flags.attr, - NULL, -}; +static ssize_t block_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ssize_t ret; + int val = 0; -/* - * display struct gfs2_args fields - */ + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags)) + val = 1; + ret = sprintf(buf, "%d\n", val); + return ret; +} -struct args_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); -}; +static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ssize_t ret = len; + int val; -#define ARGS_ATTR(name, fmt) \ -static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ -{ \ - return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name); \ -} \ -static struct args_attr args_attr_##name = __ATTR_RO(name) - -ARGS_ATTR(lockproto, "%s\n"); -ARGS_ATTR(locktable, "%s\n"); -ARGS_ATTR(hostdata, "%s\n"); -ARGS_ATTR(spectator, "%d\n"); -ARGS_ATTR(ignore_local_fs, "%d\n"); -ARGS_ATTR(localcaching, "%d\n"); -ARGS_ATTR(localflocks, "%d\n"); -ARGS_ATTR(debug, "%d\n"); -ARGS_ATTR(upgrade, "%d\n"); -ARGS_ATTR(num_glockd, "%u\n"); -ARGS_ATTR(posix_acl, "%d\n"); -ARGS_ATTR(quota, "%u\n"); -ARGS_ATTR(suiddir, "%d\n"); -ARGS_ATTR(data, "%d\n"); - -/* one oddball doesn't fit the macro mold */ -static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%d\n", - !!test_bit(SDF_NOATIME, &sdp->sd_flags)); -} -static struct args_attr args_attr_noatime = __ATTR_RO(noatime); - -static struct attribute *args_attrs[] = { - &args_attr_lockproto.attr, - &args_attr_locktable.attr, - &args_attr_hostdata.attr, - &args_attr_spectator.attr, - &args_attr_ignore_local_fs.attr, - &args_attr_localcaching.attr, - &args_attr_localflocks.attr, - &args_attr_debug.attr, - &args_attr_upgrade.attr, - &args_attr_num_glockd.attr, - &args_attr_posix_acl.attr, - &args_attr_quota.attr, - &args_attr_suiddir.attr, - &args_attr_data.attr, - &args_attr_noatime.attr, - NULL, -}; + val = simple_strtol(buf, NULL, 0); -/* - * display counters from superblock - */ + if (val == 1) + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + else if (val == 0) { + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + smp_mb__after_atomic(); + gfs2_glock_thaw(sdp); + } else { + ret = -EINVAL; + } + return ret; +} -struct counters_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); -}; +static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf) +{ + int val = completion_done(&sdp->sd_wdack) ? 1 : 0; + + return sprintf(buf, "%d\n", val); +} + +static ssize_t wdack_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + ssize_t ret = len; + int val; + + val = simple_strtol(buf, NULL, 0); + + if ((val == 1) && + !strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) + complete(&sdp->sd_wdack); + else + ret = -EINVAL; + return ret; +} + +static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_first); +} + +static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned first; + int rv; + + rv = sscanf(buf, "%u", &first); + if (rv != 1 || first > 1) + return -EINVAL; + rv = wait_for_completion_killable(&sdp->sd_locking_init); + if (rv) + return rv; + spin_lock(&sdp->sd_jindex_spin); + rv = -EBUSY; + if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + goto out; + rv = -EINVAL; + if (sdp->sd_args.ar_spectator) + goto out; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto out; + sdp->sd_lockstruct.ls_first = first; + rv = 0; +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? rv : len; +} + +static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", !!test_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags)); +} + +int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid) +{ + struct gfs2_jdesc *jd; + int rv; + + /* Wait for our primary journal to be initialized */ + wait_for_completion(&sdp->sd_journal_ready); + + spin_lock(&sdp->sd_jindex_spin); + rv = -EBUSY; + if (sdp->sd_jdesc->jd_jid == jid) + goto out; + rv = -ENOENT; + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (jd->jd_jid != jid) + continue; + rv = gfs2_recover_journal(jd, false); + break; + } +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv; +} + +static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned jid; + int rv; + + rv = sscanf(buf, "%u", &jid); + if (rv != 1) + return -EINVAL; + + if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) { + rv = -ESHUTDOWN; + goto out; + } + + rv = gfs2_recover_set(sdp, jid); +out: + return rv ? rv : len; +} + +static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_recover_jid_done); +} + +static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_recover_jid_status); +} + +static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) +{ + return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid); +} + +static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + int jid; + int rv; -#define COUNTERS_ATTR(name, fmt) \ -static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ -{ \ - return snprintf(buf, PAGE_SIZE, fmt, \ - (unsigned int)atomic_read(&sdp->sd_##name)); \ -} \ -static struct counters_attr counters_attr_##name = __ATTR_RO(name) - -COUNTERS_ATTR(glock_count, "%u\n"); -COUNTERS_ATTR(glock_held_count, "%u\n"); -COUNTERS_ATTR(inode_count, "%u\n"); -COUNTERS_ATTR(reclaimed, "%u\n"); - -static struct attribute *counters_attrs[] = { - &counters_attr_glock_count.attr, - &counters_attr_glock_held_count.attr, - &counters_attr_inode_count.attr, - &counters_attr_reclaimed.attr, + rv = sscanf(buf, "%d", &jid); + if (rv != 1) + return -EINVAL; + rv = wait_for_completion_killable(&sdp->sd_locking_init); + if (rv) + return rv; + spin_lock(&sdp->sd_jindex_spin); + rv = -EINVAL; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto out; + rv = -EBUSY; + if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + goto out; + rv = 0; + if (sdp->sd_args.ar_spectator && jid > 0) + rv = jid = -EINVAL; + sdp->sd_lockstruct.ls_jid = jid; + clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); + smp_mb__after_atomic(); + wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? rv : len; +} + +#define GDLM_ATTR(_name,_mode,_show,_store) \ +static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) + +GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); +GDLM_ATTR(block, 0644, block_show, block_store); +GDLM_ATTR(withdraw, 0644, wdack_show, wdack_store); +GDLM_ATTR(jid, 0644, jid_show, jid_store); +GDLM_ATTR(first, 0644, lkfirst_show, lkfirst_store); +GDLM_ATTR(first_done, 0444, first_done_show, NULL); +GDLM_ATTR(recover, 0600, NULL, recover_store); +GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); +GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); + +static struct attribute *lock_module_attrs[] = { + &gdlm_attr_proto_name.attr, + &gdlm_attr_block.attr, + &gdlm_attr_withdraw.attr, + &gdlm_attr_jid.attr, + &gdlm_attr_first.attr, + &gdlm_attr_first_done.attr, + &gdlm_attr_recover.attr, + &gdlm_attr_recover_done.attr, + &gdlm_attr_recover_status.attr, NULL, }; @@ -359,7 +536,7 @@ static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf, unsigned int x, y; if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; if (sscanf(buf, "%u %u", &x, &y) != 2 || !y) return -EINVAL; @@ -378,7 +555,7 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field, unsigned int x; if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; x = simple_strtoul(buf, NULL, 0); @@ -391,14 +568,8 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field, return len; } -struct tune_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); - ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); -}; - #define TUNE_ATTR_3(name, show, store) \ -static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store) +static struct gfs2_attr tune_attr_##name = __ATTR(name, 0644, show, store) #define TUNE_ATTR_2(name, store) \ static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ @@ -414,134 +585,120 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ } \ TUNE_ATTR_2(name, name##_store) -#define TUNE_ATTR_DAEMON(name, process) \ -static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ -{ \ - ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \ - wake_up_process(sdp->sd_##process); \ - return r; \ -} \ -TUNE_ATTR_2(name, name##_store) - -TUNE_ATTR(demote_secs, 0); -TUNE_ATTR(incore_log_blocks, 0); -TUNE_ATTR(log_flush_secs, 0); TUNE_ATTR(quota_warn_period, 0); TUNE_ATTR(quota_quantum, 0); -TUNE_ATTR(atime_quantum, 0); TUNE_ATTR(max_readahead, 0); TUNE_ATTR(complain_secs, 0); TUNE_ATTR(statfs_slow, 0); TUNE_ATTR(new_files_jdata, 0); -TUNE_ATTR(new_files_directio, 0); -TUNE_ATTR(quota_simul_sync, 1); -TUNE_ATTR(quota_cache_secs, 1); -TUNE_ATTR(stall_secs, 1); TUNE_ATTR(statfs_quantum, 1); -TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); -TUNE_ATTR_DAEMON(logd_secs, logd_process); -TUNE_ATTR_DAEMON(quotad_secs, quotad_process); TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); static struct attribute *tune_attrs[] = { - &tune_attr_demote_secs.attr, - &tune_attr_incore_log_blocks.attr, - &tune_attr_log_flush_secs.attr, &tune_attr_quota_warn_period.attr, &tune_attr_quota_quantum.attr, - &tune_attr_atime_quantum.attr, &tune_attr_max_readahead.attr, &tune_attr_complain_secs.attr, &tune_attr_statfs_slow.attr, - &tune_attr_quota_simul_sync.attr, - &tune_attr_quota_cache_secs.attr, - &tune_attr_stall_secs.attr, &tune_attr_statfs_quantum.attr, - &tune_attr_recoverd_secs.attr, - &tune_attr_logd_secs.attr, - &tune_attr_quotad_secs.attr, &tune_attr_quota_scale.attr, &tune_attr_new_files_jdata.attr, - &tune_attr_new_files_directio.attr, NULL, }; -static struct attribute_group lockstruct_group = { - .name = "lockstruct", - .attrs = lockstruct_attrs, -}; - -static struct attribute_group counters_group = { - .name = "counters", - .attrs = counters_attrs, -}; - -static struct attribute_group args_group = { - .name = "args", - .attrs = args_attrs, -}; - static struct attribute_group tune_group = { .name = "tune", .attrs = tune_attrs, }; +static struct attribute_group lock_module_group = { + .name = "lock_module", + .attrs = lock_module_attrs, +}; + int gfs2_sys_fs_add(struct gfs2_sbd *sdp) { + struct super_block *sb = sdp->sd_vfs; int error; + char ro[20]; + char spectator[20]; + char *envp[] = { ro, spectator, NULL }; + int sysfs_frees_sdp = 0; + + sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); + sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); sdp->sd_kobj.kset = gfs2_kset; error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, "%s", sdp->sd_table_name); if (error) - goto fail; - - error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group); - if (error) goto fail_reg; - error = sysfs_create_group(&sdp->sd_kobj, &counters_group); + sysfs_frees_sdp = 1; /* Freeing sdp is now done by sysfs calling + function gfs2_sbd_release. */ + error = sysfs_create_group(&sdp->sd_kobj, &tune_group); if (error) - goto fail_lockstruct; + goto fail_reg; - error = sysfs_create_group(&sdp->sd_kobj, &args_group); + error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group); if (error) - goto fail_counters; + goto fail_tune; - error = sysfs_create_group(&sdp->sd_kobj, &tune_group); + error = sysfs_create_link(&sdp->sd_kobj, + &disk_to_dev(sb->s_bdev->bd_disk)->kobj, + "device"); if (error) - goto fail_args; + goto fail_lock_module; - kobject_uevent(&sdp->sd_kobj, KOBJ_ADD); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp); return 0; -fail_args: - sysfs_remove_group(&sdp->sd_kobj, &args_group); -fail_counters: - sysfs_remove_group(&sdp->sd_kobj, &counters_group); -fail_lockstruct: - sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); +fail_lock_module: + sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); +fail_tune: + sysfs_remove_group(&sdp->sd_kobj, &tune_group); fail_reg: - kobject_put(&sdp->sd_kobj); -fail: + free_percpu(sdp->sd_lkstats); fs_err(sdp, "error %d adding sysfs files", error); + if (sysfs_frees_sdp) + kobject_put(&sdp->sd_kobj); + else + kfree(sdp); + sb->s_fs_info = NULL; return error; } void gfs2_sys_fs_del(struct gfs2_sbd *sdp) { + sysfs_remove_link(&sdp->sd_kobj, "device"); sysfs_remove_group(&sdp->sd_kobj, &tune_group); - sysfs_remove_group(&sdp->sd_kobj, &args_group); - sysfs_remove_group(&sdp->sd_kobj, &counters_group); - sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); + sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); kobject_put(&sdp->sd_kobj); } +static int gfs2_uevent(struct kset *kset, struct kobject *kobj, + struct kobj_uevent_env *env) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct super_block *s = sdp->sd_vfs; + const u8 *uuid = s->s_uuid; + + add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); + add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); + if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) + add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid); + if (gfs2_uuid_valid(uuid)) + add_uevent_var(env, "UUID=%pUB", uuid); + return 0; +} + +static const struct kset_uevent_ops gfs2_uevent_ops = { + .uevent = gfs2_uevent, +}; + int gfs2_sys_init(void) { - gfs2_sys_margs = NULL; - spin_lock_init(&gfs2_sys_margs_lock); - gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj); + gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj); if (!gfs2_kset) return -ENOMEM; return 0; @@ -549,7 +706,6 @@ int gfs2_sys_init(void) void gfs2_sys_uninit(void) { - kfree(gfs2_sys_margs); kset_unregister(gfs2_kset); } diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h index 1ca8cdac530..79182d6ad6a 100644 --- a/fs/gfs2/sys.h +++ b/fs/gfs2/sys.h @@ -13,15 +13,13 @@ #include <linux/spinlock.h> struct gfs2_sbd; -/* Allow args to be passed to GFS2 when using an initial ram disk */ -extern char *gfs2_sys_margs; -extern spinlock_t gfs2_sys_margs_lock; - int gfs2_sys_fs_add(struct gfs2_sbd *sdp); void gfs2_sys_fs_del(struct gfs2_sbd *sdp); int gfs2_sys_init(void); void gfs2_sys_uninit(void); +int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid); + #endif /* __SYS_DOT_H__ */ diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h new file mode 100644 index 00000000000..20c007d747a --- /dev/null +++ b/fs/gfs2/trace_gfs2.h @@ -0,0 +1,558 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM gfs2 + +#if !defined(_TRACE_GFS2_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_GFS2_H + +#include <linux/tracepoint.h> + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/dlmconstants.h> +#include <linux/gfs2_ondisk.h> +#include <linux/writeback.h> +#include <linux/ktime.h> +#include "incore.h" +#include "glock.h" +#include "rgrp.h" + +#define dlm_state_name(nn) { DLM_LOCK_##nn, #nn } +#define glock_trace_name(x) __print_symbolic(x, \ + dlm_state_name(IV), \ + dlm_state_name(NL), \ + dlm_state_name(CR), \ + dlm_state_name(CW), \ + dlm_state_name(PR), \ + dlm_state_name(PW), \ + dlm_state_name(EX)) + +#define block_state_name(x) __print_symbolic(x, \ + { GFS2_BLKST_FREE, "free" }, \ + { GFS2_BLKST_USED, "used" }, \ + { GFS2_BLKST_DINODE, "dinode" }, \ + { GFS2_BLKST_UNLINKED, "unlinked" }) + +#define TRACE_RS_DELETE 0 +#define TRACE_RS_TREEDEL 1 +#define TRACE_RS_INSERT 2 +#define TRACE_RS_CLAIM 3 + +#define rs_func_name(x) __print_symbolic(x, \ + { 0, "del " }, \ + { 1, "tdel" }, \ + { 2, "ins " }, \ + { 3, "clm " }) + +#define show_glock_flags(flags) __print_flags(flags, "", \ + {(1UL << GLF_LOCK), "l" }, \ + {(1UL << GLF_DEMOTE), "D" }, \ + {(1UL << GLF_PENDING_DEMOTE), "d" }, \ + {(1UL << GLF_DEMOTE_IN_PROGRESS), "p" }, \ + {(1UL << GLF_DIRTY), "y" }, \ + {(1UL << GLF_LFLUSH), "f" }, \ + {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ + {(1UL << GLF_REPLY_PENDING), "r" }, \ + {(1UL << GLF_INITIAL), "I" }, \ + {(1UL << GLF_FROZEN), "F" }, \ + {(1UL << GLF_QUEUED), "q" }, \ + {(1UL << GLF_LRU), "L" }, \ + {(1UL << GLF_OBJECT), "o" }, \ + {(1UL << GLF_BLOCKING), "b" }) + +#ifndef NUMPTY +#define NUMPTY +static inline u8 glock_trace_state(unsigned int state) +{ + switch(state) { + case LM_ST_SHARED: + return DLM_LOCK_PR; + case LM_ST_DEFERRED: + return DLM_LOCK_CW; + case LM_ST_EXCLUSIVE: + return DLM_LOCK_EX; + } + return DLM_LOCK_NL; +} +#endif + +/* Section 1 - Locking + * + * Objectives: + * Latency: Remote demote request to state change + * Latency: Local lock request to state change + * Latency: State change to lock grant + * Correctness: Ordering of local lock state vs. I/O requests + * Correctness: Responses to remote demote requests + */ + +/* General glock state change (DLM lock request completes) */ +TRACE_EVENT(gfs2_glock_state_change, + + TP_PROTO(const struct gfs2_glock *gl, unsigned int new_state), + + TP_ARGS(gl, new_state), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( u8, new_state ) + __field( u8, dmt_state ) + __field( u8, tgt_state ) + __field( unsigned long, flags ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gl->gl_name.ln_number; + __entry->gltype = gl->gl_name.ln_type; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->new_state = glock_trace_state(new_state); + __entry->tgt_state = glock_trace_state(gl->gl_target); + __entry->dmt_state = glock_trace_state(gl->gl_demote_state); + __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0); + ), + + TP_printk("%u,%u glock %d:%lld state %s to %s tgt:%s dmt:%s flags:%s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(__entry->new_state), + glock_trace_name(__entry->tgt_state), + glock_trace_name(__entry->dmt_state), + show_glock_flags(__entry->flags)) +); + +/* State change -> unlocked, glock is being deallocated */ +TRACE_EVENT(gfs2_glock_put, + + TP_PROTO(const struct gfs2_glock *gl), + + TP_ARGS(gl), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( unsigned long, flags ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->gltype = gl->gl_name.ln_type; + __entry->glnum = gl->gl_name.ln_number; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0); + ), + + TP_printk("%u,%u glock %d:%lld state %s => %s flags:%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->gltype, (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(DLM_LOCK_IV), + show_glock_flags(__entry->flags)) + +); + +/* Callback (local or remote) requesting lock demotion */ +TRACE_EVENT(gfs2_demote_rq, + + TP_PROTO(const struct gfs2_glock *gl, bool remote), + + TP_ARGS(gl, remote), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( u8, dmt_state ) + __field( unsigned long, flags ) + __field( bool, remote ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->gltype = gl->gl_name.ln_type; + __entry->glnum = gl->gl_name.ln_number; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->dmt_state = glock_trace_state(gl->gl_demote_state); + __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0); + __entry->remote = remote; + ), + + TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s %s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(__entry->dmt_state), + show_glock_flags(__entry->flags), + __entry->remote ? "remote" : "local") + +); + +/* Promotion/grant of a glock */ +TRACE_EVENT(gfs2_promote, + + TP_PROTO(const struct gfs2_holder *gh, int first), + + TP_ARGS(gh, first), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( int, first ) + __field( u8, state ) + ), + + TP_fast_assign( + __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gh->gh_gl->gl_name.ln_number; + __entry->gltype = gh->gh_gl->gl_name.ln_type; + __entry->first = first; + __entry->state = glock_trace_state(gh->gh_state); + ), + + TP_printk("%u,%u glock %u:%llu promote %s %s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + __entry->first ? "first": "other", + glock_trace_name(__entry->state)) +); + +/* Queue/dequeue a lock request */ +TRACE_EVENT(gfs2_glock_queue, + + TP_PROTO(const struct gfs2_holder *gh, int queue), + + TP_ARGS(gh, queue), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( int, queue ) + __field( u8, state ) + ), + + TP_fast_assign( + __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gh->gh_gl->gl_name.ln_number; + __entry->gltype = gh->gh_gl->gl_name.ln_type; + __entry->queue = queue; + __entry->state = glock_trace_state(gh->gh_state); + ), + + TP_printk("%u,%u glock %u:%llu %squeue %s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + __entry->queue ? "" : "de", + glock_trace_name(__entry->state)) +); + +/* DLM sends a reply to GFS2 */ +TRACE_EVENT(gfs2_glock_lock_time, + + TP_PROTO(const struct gfs2_glock *gl, s64 tdiff), + + TP_ARGS(gl, tdiff), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( int, status ) + __field( char, flags ) + __field( s64, tdiff ) + __field( s64, srtt ) + __field( s64, srttvar ) + __field( s64, srttb ) + __field( s64, srttvarb ) + __field( s64, sirt ) + __field( s64, sirtvar ) + __field( s64, dcount ) + __field( s64, qcount ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gl->gl_name.ln_number; + __entry->gltype = gl->gl_name.ln_type; + __entry->status = gl->gl_lksb.sb_status; + __entry->flags = gl->gl_lksb.sb_flags; + __entry->tdiff = tdiff; + __entry->srtt = gl->gl_stats.stats[GFS2_LKS_SRTT]; + __entry->srttvar = gl->gl_stats.stats[GFS2_LKS_SRTTVAR]; + __entry->srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB]; + __entry->srttvarb = gl->gl_stats.stats[GFS2_LKS_SRTTVARB]; + __entry->sirt = gl->gl_stats.stats[GFS2_LKS_SIRT]; + __entry->sirtvar = gl->gl_stats.stats[GFS2_LKS_SIRTVAR]; + __entry->dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT]; + __entry->qcount = gl->gl_stats.stats[GFS2_LKS_QCOUNT]; + ), + + TP_printk("%u,%u glock %d:%lld status:%d flags:%02x tdiff:%lld srtt:%lld/%lld srttb:%lld/%lld sirt:%lld/%lld dcnt:%lld qcnt:%lld", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + __entry->status, __entry->flags, + (long long)__entry->tdiff, + (long long)__entry->srtt, + (long long)__entry->srttvar, + (long long)__entry->srttb, + (long long)__entry->srttvarb, + (long long)__entry->sirt, + (long long)__entry->sirtvar, + (long long)__entry->dcount, + (long long)__entry->qcount) +); + +/* Section 2 - Log/journal + * + * Objectives: + * Latency: Log flush time + * Correctness: pin/unpin vs. disk I/O ordering + * Performance: Log usage stats + */ + +/* Pin/unpin a block in the log */ +TRACE_EVENT(gfs2_pin, + + TP_PROTO(const struct gfs2_bufdata *bd, int pin), + + TP_ARGS(bd, pin), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, pin ) + __field( u32, len ) + __field( sector_t, block ) + __field( u64, ino ) + ), + + TP_fast_assign( + __entry->dev = bd->bd_gl->gl_sbd->sd_vfs->s_dev; + __entry->pin = pin; + __entry->len = bd->bd_bh->b_size; + __entry->block = bd->bd_bh->b_blocknr; + __entry->ino = bd->bd_gl->gl_name.ln_number; + ), + + TP_printk("%u,%u log %s %llu/%lu inode %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->pin ? "pin" : "unpin", + (unsigned long long)__entry->block, + (unsigned long)__entry->len, + (unsigned long long)__entry->ino) +); + +/* Flushing the log */ +TRACE_EVENT(gfs2_log_flush, + + TP_PROTO(const struct gfs2_sbd *sdp, int start), + + TP_ARGS(sdp, start), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, start ) + __field( u64, log_seq ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->start = start; + __entry->log_seq = sdp->sd_log_sequence; + ), + + TP_printk("%u,%u log flush %s %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->start ? "start" : "end", + (unsigned long long)__entry->log_seq) +); + +/* Reserving/releasing blocks in the log */ +TRACE_EVENT(gfs2_log_blocks, + + TP_PROTO(const struct gfs2_sbd *sdp, int blocks), + + TP_ARGS(sdp, blocks), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, blocks ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->blocks = blocks; + ), + + TP_printk("%u,%u log reserve %d", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->blocks) +); + +/* Writing back the AIL */ +TRACE_EVENT(gfs2_ail_flush, + + TP_PROTO(const struct gfs2_sbd *sdp, const struct writeback_control *wbc, int start), + + TP_ARGS(sdp, wbc, start), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, start ) + __field( int, sync_mode ) + __field( long, nr_to_write ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->start = start; + __entry->sync_mode = wbc->sync_mode; + __entry->nr_to_write = wbc->nr_to_write; + ), + + TP_printk("%u,%u ail flush %s %s %ld", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->start ? "start" : "end", + __entry->sync_mode == WB_SYNC_ALL ? "all" : "none", + __entry->nr_to_write) +); + +/* Section 3 - bmap + * + * Objectives: + * Latency: Bmap request time + * Performance: Block allocator tracing + * Correctness: Test of disard generation vs. blocks allocated + */ + +/* Map an extent of blocks, possibly a new allocation */ +TRACE_EVENT(gfs2_bmap, + + TP_PROTO(const struct gfs2_inode *ip, const struct buffer_head *bh, + sector_t lblock, int create, int errno), + + TP_ARGS(ip, bh, lblock, create, errno), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, lblock ) + __field( sector_t, pblock ) + __field( u64, inum ) + __field( unsigned long, state ) + __field( u32, len ) + __field( int, create ) + __field( int, errno ) + ), + + TP_fast_assign( + __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev; + __entry->lblock = lblock; + __entry->pblock = buffer_mapped(bh) ? bh->b_blocknr : 0; + __entry->inum = ip->i_no_addr; + __entry->state = bh->b_state; + __entry->len = bh->b_size; + __entry->create = create; + __entry->errno = errno; + ), + + TP_printk("%u,%u bmap %llu map %llu/%lu to %llu flags:%08lx %s %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->lblock, + (unsigned long)__entry->len, + (unsigned long long)__entry->pblock, + __entry->state, __entry->create ? "create " : "nocreate", + __entry->errno) +); + +/* Keep track of blocks as they are allocated/freed */ +TRACE_EVENT(gfs2_block_alloc, + + TP_PROTO(const struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, + u64 block, unsigned len, u8 block_state), + + TP_ARGS(ip, rgd, block, len, block_state), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, start ) + __field( u64, inum ) + __field( u32, len ) + __field( u8, block_state ) + __field( u64, rd_addr ) + __field( u32, rd_free_clone ) + __field( u32, rd_reserved ) + ), + + TP_fast_assign( + __entry->dev = rgd->rd_gl->gl_sbd->sd_vfs->s_dev; + __entry->start = block; + __entry->inum = ip->i_no_addr; + __entry->len = len; + __entry->block_state = block_state; + __entry->rd_addr = rgd->rd_addr; + __entry->rd_free_clone = rgd->rd_free_clone; + __entry->rd_reserved = rgd->rd_reserved; + ), + + TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rr:%lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->start, + (unsigned long)__entry->len, + block_state_name(__entry->block_state), + (unsigned long long)__entry->rd_addr, + __entry->rd_free_clone, (unsigned long)__entry->rd_reserved) +); + +/* Keep track of multi-block reservations as they are allocated/freed */ +TRACE_EVENT(gfs2_rs, + + TP_PROTO(const struct gfs2_blkreserv *rs, u8 func), + + TP_ARGS(rs, func), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, rd_addr ) + __field( u32, rd_free_clone ) + __field( u32, rd_reserved ) + __field( u64, inum ) + __field( u64, start ) + __field( u32, free ) + __field( u8, func ) + ), + + TP_fast_assign( + __entry->dev = rs->rs_rbm.rgd->rd_sbd->sd_vfs->s_dev; + __entry->rd_addr = rs->rs_rbm.rgd->rd_addr; + __entry->rd_free_clone = rs->rs_rbm.rgd->rd_free_clone; + __entry->rd_reserved = rs->rs_rbm.rgd->rd_reserved; + __entry->inum = rs->rs_inum; + __entry->start = gfs2_rbm_to_block(&rs->rs_rbm); + __entry->free = rs->rs_free; + __entry->func = func; + ), + + TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s f:%lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->start, + (unsigned long long)__entry->rd_addr, + (unsigned long)__entry->rd_free_clone, + (unsigned long)__entry->rd_reserved, + rs_func_name(__entry->func), (unsigned long)__entry->free) +); + +#endif /* _TRACE_GFS2_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_gfs2 +#include <trace/define_trace.h> + diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 73e5d92a657..0546ab4e28e 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -7,23 +7,26 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> -#include <linux/gfs2_ondisk.h> #include <linux/kallsyms.h> -#include <linux/lm_interface.h> +#include <linux/gfs2_ondisk.h> #include "gfs2.h" #include "incore.h" #include "glock.h" +#include "inode.h" #include "log.h" #include "lops.h" #include "meta_io.h" #include "trans.h" #include "util.h" +#include "trace_gfs2.h" int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, unsigned int revokes) @@ -34,6 +37,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, BUG_ON(current->journal_info); BUG_ON(blocks == 0 && revokes == 0); + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + return -EROFS; + tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS); if (!tr) return -ENOMEM; @@ -42,138 +48,223 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, tr->tr_blocks = blocks; tr->tr_revokes = revokes; tr->tr_reserved = 1; + tr->tr_alloced = 1; if (blocks) tr->tr_reserved += 6 + blocks; if (revokes) tr->tr_reserved += gfs2_struct2blk(sdp, revokes, sizeof(u64)); - INIT_LIST_HEAD(&tr->tr_list_buf); - - gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); - - error = gfs2_glock_nq(&tr->tr_t_gh); - if (error) - goto fail_holder_uninit; + INIT_LIST_HEAD(&tr->tr_databuf); + INIT_LIST_HEAD(&tr->tr_buf); - if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { - tr->tr_t_gh.gh_flags |= GL_NOCACHE; - error = -EROFS; - goto fail_gunlock; - } + sb_start_intwrite(sdp->sd_vfs); error = gfs2_log_reserve(sdp, tr->tr_reserved); if (error) - goto fail_gunlock; + goto fail; current->journal_info = tr; return 0; -fail_gunlock: - gfs2_glock_dq(&tr->tr_t_gh); - -fail_holder_uninit: - gfs2_holder_uninit(&tr->tr_t_gh); +fail: + sb_end_intwrite(sdp->sd_vfs); kfree(tr); return error; } +static void gfs2_print_trans(const struct gfs2_trans *tr) +{ + pr_warn("Transaction created at: %pSR\n", (void *)tr->tr_ip); + pr_warn("blocks=%u revokes=%u reserved=%u touched=%u\n", + tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched); + pr_warn("Buf %u/%u Databuf %u/%u Revoke %u/%u\n", + tr->tr_num_buf_new, tr->tr_num_buf_rm, + tr->tr_num_databuf_new, tr->tr_num_databuf_rm, + tr->tr_num_revoke, tr->tr_num_revoke_rm); +} + void gfs2_trans_end(struct gfs2_sbd *sdp) { struct gfs2_trans *tr = current->journal_info; - + s64 nbuf; BUG_ON(!tr); current->journal_info = NULL; if (!tr->tr_touched) { gfs2_log_release(sdp, tr->tr_reserved); - gfs2_glock_dq(&tr->tr_t_gh); - gfs2_holder_uninit(&tr->tr_t_gh); - kfree(tr); + if (tr->tr_alloced) + kfree(tr); + sb_end_intwrite(sdp->sd_vfs); return; } - if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) { - fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u ", - tr->tr_num_buf, tr->tr_blocks); - print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip); - } - if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) { - fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u ", - tr->tr_num_revoke, tr->tr_revokes); - print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip); - } + nbuf = tr->tr_num_buf_new + tr->tr_num_databuf_new; + nbuf -= tr->tr_num_buf_rm; + nbuf -= tr->tr_num_databuf_rm; + + if (gfs2_assert_withdraw(sdp, (nbuf <= tr->tr_blocks) && + (tr->tr_num_revoke <= tr->tr_revokes))) + gfs2_print_trans(tr); gfs2_log_commit(sdp, tr); - gfs2_glock_dq(&tr->tr_t_gh); - gfs2_holder_uninit(&tr->tr_t_gh); - kfree(tr); + if (tr->tr_alloced && !tr->tr_attached) + kfree(tr); + up_read(&sdp->sd_log_flush_lock); if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) - gfs2_log_flush(sdp, NULL); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + sb_end_intwrite(sdp->sd_vfs); +} + +static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, + struct buffer_head *bh, + const struct gfs2_log_operations *lops) +{ + struct gfs2_bufdata *bd; + + bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); + bd->bd_bh = bh; + bd->bd_gl = gl; + bd->bd_ops = lops; + INIT_LIST_HEAD(&bd->bd_list); + bh->b_private = bd; + return bd; } /** - * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction - * @gl: the glock the buffer belongs to + * gfs2_trans_add_data - Add a databuf to the transaction. + * @gl: The inode glock associated with the buffer * @bh: The buffer to add - * @meta: True in the case of adding metadata * + * This is used in two distinct cases: + * i) In ordered write mode + * We put the data buffer on a list so that we can ensure that its + * synced to disk at the right time + * ii) In journaled data mode + * We need to journal the data block in the same way as metadata in + * the functions above. The difference is that here we have a tag + * which is two __be64's being the block number (as per meta data) + * and a flag which says whether the data block needs escaping or + * not. This means we need a new log entry for each 251 or so data + * blocks, which isn't an enormous overhead but twice as much as + * for normal metadata blocks. */ - -void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta) +void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) { + struct gfs2_trans *tr = current->journal_info; struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = bh->b_page->mapping; + struct gfs2_inode *ip = GFS2_I(mapping->host); struct gfs2_bufdata *bd; + if (!gfs2_is_jdata(ip)) { + gfs2_ordered_add_inode(ip); + return; + } + + lock_buffer(bh); + gfs2_log_lock(sdp); bd = bh->b_private; - if (bd) - gfs2_assert(sdp, bd->bd_gl == gl); - else { - gfs2_attach_bufdata(gl, bh, meta); - bd = bh->b_private; + if (bd == NULL) { + gfs2_log_unlock(sdp); + unlock_buffer(bh); + if (bh->b_private == NULL) + bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops); + lock_buffer(bh); + gfs2_log_lock(sdp); + } + gfs2_assert(sdp, bd->bd_gl == gl); + tr->tr_touched = 1; + if (list_empty(&bd->bd_list)) { + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); + gfs2_pin(sdp, bd->bd_bh); + tr->tr_num_databuf_new++; + list_add_tail(&bd->bd_list, &tr->tr_databuf); } - lops_add(sdp, &bd->bd_le); + gfs2_log_unlock(sdp); + unlock_buffer(bh); } -void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) +static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { - BUG_ON(!list_empty(&bd->bd_le.le_list)); - BUG_ON(!list_empty(&bd->bd_ail_st_list)); - BUG_ON(!list_empty(&bd->bd_ail_gl_list)); - lops_init_le(&bd->bd_le, &gfs2_revoke_lops); - lops_add(sdp, &bd->bd_le); + struct gfs2_meta_header *mh; + struct gfs2_trans *tr; + + tr = current->journal_info; + tr->tr_touched = 1; + if (!list_empty(&bd->bd_list)) + return; + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); + mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; + if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { + pr_err("Attempting to add uninitialised block to journal (inplace block=%lld)\n", + (unsigned long long)bd->bd_bh->b_blocknr); + BUG(); + } + gfs2_pin(sdp, bd->bd_bh); + mh->__pad0 = cpu_to_be64(0); + mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); + list_add(&bd->bd_list, &tr->tr_buf); + tr->tr_num_buf_new++; } -void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno) +void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) { + + struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_bufdata *bd; - int found = 0; + lock_buffer(bh); gfs2_log_lock(sdp); - - list_for_each_entry(bd, &sdp->sd_log_le_revoke, bd_le.le_list) { - if (bd->bd_blkno == blkno) { - list_del_init(&bd->bd_le.le_list); - gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); - sdp->sd_log_num_revoke--; - found = 1; - break; - } + bd = bh->b_private; + if (bd == NULL) { + gfs2_log_unlock(sdp); + unlock_buffer(bh); + lock_page(bh->b_page); + if (bh->b_private == NULL) + bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops); + unlock_page(bh->b_page); + lock_buffer(bh); + gfs2_log_lock(sdp); } - + gfs2_assert(sdp, bd->bd_gl == gl); + meta_lo_add(sdp, bd); gfs2_log_unlock(sdp); + unlock_buffer(bh); +} - if (found) { - struct gfs2_trans *tr = current->journal_info; - kmem_cache_free(gfs2_bufdata_cachep, bd); - tr->tr_num_revoke_rm++; - } +void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) +{ + struct gfs2_trans *tr = current->journal_info; + + BUG_ON(!list_empty(&bd->bd_list)); + gfs2_add_revoke(sdp, bd); + tr->tr_touched = 1; + tr->tr_num_revoke++; } -void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd) +void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) { - lops_add(rgd->rd_sbd, &rgd->rd_le); + struct gfs2_bufdata *bd, *tmp; + struct gfs2_trans *tr = current->journal_info; + unsigned int n = len; + + gfs2_log_lock(sdp); + list_for_each_entry_safe(bd, tmp, &sdp->sd_log_le_revoke, bd_list) { + if ((bd->bd_blkno >= blkno) && (bd->bd_blkno < (blkno + len))) { + list_del_init(&bd->bd_list); + gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); + sdp->sd_log_num_revoke--; + kmem_cache_free(gfs2_bufdata_cachep, bd); + tr->tr_num_revoke_rm++; + if (--n == 0) + break; + } + } + gfs2_log_unlock(sdp); } diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index e826f0dab80..1e6e7da25a1 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -20,19 +20,28 @@ struct gfs2_glock; #define RES_JDATA 1 #define RES_DATA 1 #define RES_LEAF 1 +#define RES_RG_HDR 1 #define RES_RG_BIT 2 #define RES_EATTR 1 #define RES_STATFS 1 #define RES_QUOTA 2 -int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, - unsigned int revokes); +/* reserve either the number of blocks to be allocated plus the rg header + * block, or all of the blocks in the rg, whichever is smaller */ +static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip, unsigned requested) +{ + if (requested < ip->i_rgd->rd_length) + return requested + 1; + return ip->i_rgd->rd_length; +} -void gfs2_trans_end(struct gfs2_sbd *sdp); +extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes); -void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); -void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); -void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno); -void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd); +extern void gfs2_trans_end(struct gfs2_sbd *sdp); +extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh); +extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh); +extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); #endif /* __TRANS_DOT_H__ */ diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 424a0774eda..86d2035ac66 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -7,29 +7,75 @@ * of the GNU General Public License version 2. */ -#include <linux/slab.h> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/crc32.h> #include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> #include <asm/uaccess.h> #include "gfs2.h" #include "incore.h" #include "glock.h" -#include "lm.h" #include "util.h" struct kmem_cache *gfs2_glock_cachep __read_mostly; +struct kmem_cache *gfs2_glock_aspace_cachep __read_mostly; struct kmem_cache *gfs2_inode_cachep __read_mostly; struct kmem_cache *gfs2_bufdata_cachep __read_mostly; +struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; +struct kmem_cache *gfs2_quotad_cachep __read_mostly; +struct kmem_cache *gfs2_rsrv_cachep __read_mostly; +mempool_t *gfs2_page_pool __read_mostly; void gfs2_assert_i(struct gfs2_sbd *sdp) { - printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n", - sdp->sd_fsname); + fs_emerg(sdp, "fatal assertion failed\n"); +} + +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + const struct lm_lockops *lm = ls->ls_ops; + va_list args; + struct va_format vaf; + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && + test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return 0; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + fs_err(sdp, "%pV", &vaf); + + va_end(args); + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { + fs_err(sdp, "about to withdraw this file system\n"); + BUG_ON(sdp->sd_args.ar_debug); + + kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); + + if (!strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) + wait_for_completion(&sdp->sd_wdack); + + if (lm->lm_unmount) { + fs_err(sdp, "telling LM to unmount\n"); + lm->lm_unmount(sdp); + } + fs_err(sdp, "withdrawn\n"); + dump_stack(); + } + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) + panic("GFS2: fsid=%s: panic requested\n", sdp->sd_fsname); + + return -1; } /** @@ -43,10 +89,9 @@ int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, { int me; me = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, assertion, - sdp->sd_fsname, function, file, line); + "fatal: assertion \"%s\" failed\n" + " function = %s, file = %s, line = %u\n", + assertion, function, file, line); dump_stack(); return (me) ? -1 : -2; } @@ -65,17 +110,21 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, gfs2_tune_get(sdp, gt_complain_secs) * HZ)) return -2; - printk(KERN_WARNING - "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, assertion, - sdp->sd_fsname, function, file, line); + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) + fs_warn(sdp, "warning: assertion \"%s\" failed at function = %s, file = %s, line = %u\n", + assertion, function, file, line); if (sdp->sd_args.ar_debug) BUG(); else dump_stack(); + if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) + panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); + sdp->sd_last_warning = jiffies; return -1; @@ -92,10 +141,8 @@ int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function, { int rv; rv = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: filesystem consistency error\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, function, file, line); + "fatal: filesystem consistency error - function = %s, file = %s, line = %u\n", + function, file, line); return rv; } @@ -111,13 +158,12 @@ int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); int rv; rv = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: filesystem consistency error\n" - "GFS2: fsid=%s: inode = %llu %llu\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, (unsigned long long)ip->i_no_formal_ino, - (unsigned long long)ip->i_no_addr, - sdp->sd_fsname, function, file, line); + "fatal: filesystem consistency error\n" + " inode = %llu %llu\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)ip->i_no_formal_ino, + (unsigned long long)ip->i_no_addr, + function, file, line); return rv; } @@ -133,12 +179,11 @@ int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, struct gfs2_sbd *sdp = rgd->rd_sbd; int rv; rv = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: filesystem consistency error\n" - "GFS2: fsid=%s: RG = %llu\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, (unsigned long long)rgd->rd_addr, - sdp->sd_fsname, function, file, line); + "fatal: filesystem consistency error\n" + " RG = %llu\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)rgd->rd_addr, + function, file, line); return rv; } @@ -154,12 +199,11 @@ int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, { int me; me = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: invalid metadata block\n" - "GFS2: fsid=%s: bh = %llu (%s)\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, - sdp->sd_fsname, function, file, line); + "fatal: invalid metadata block\n" + " bh = %llu (%s)\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)bh->b_blocknr, type, + function, file, line); return (me) ? -1 : -2; } @@ -175,12 +219,11 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, { int me; me = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: invalid metadata block\n" - "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t, - sdp->sd_fsname, function, file, line); + "fatal: invalid metadata block\n" + " bh = %llu (type: exp=%u, found=%u)\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)bh->b_blocknr, type, t, + function, file, line); return (me) ? -1 : -2; } @@ -195,10 +238,9 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, { int rv; rv = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: I/O error\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, function, file, line); + "fatal: I/O error\n" + " function = %s, file = %s, line = %u\n", + function, file, line); return rv; } @@ -213,32 +255,11 @@ int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, { int rv; rv = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: I/O error\n" - "GFS2: fsid=%s: block = %llu\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, (unsigned long long)bh->b_blocknr, - sdp->sd_fsname, function, file, line); + "fatal: I/O error\n" + " block = %llu\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)bh->b_blocknr, + function, file, line); return rv; } -void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, - unsigned int bit, int new_value) -{ - unsigned int c, o, b = bit; - int old_value; - - c = b / (8 * PAGE_SIZE); - b %= 8 * PAGE_SIZE; - o = b / 8; - b %= 8; - - old_value = (bitmap[c][o] & (1 << b)); - gfs2_assert_withdraw(sdp, !old_value != !new_value); - - if (new_value) - bitmap[c][o] |= 1 << b; - else - bitmap[c][o] &= ~(1 << b); -} - diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 28938a46cf4..cbdcbdf3961 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -10,20 +10,23 @@ #ifndef __UTIL_DOT_H__ #define __UTIL_DOT_H__ -#include "incore.h" - -#define fs_printk(level, fs, fmt, arg...) \ - printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg) - -#define fs_info(fs, fmt, arg...) \ - fs_printk(KERN_INFO , fs , fmt , ## arg) +#ifdef pr_fmt +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#endif -#define fs_warn(fs, fmt, arg...) \ - fs_printk(KERN_WARNING , fs , fmt , ## arg) +#include <linux/mempool.h> -#define fs_err(fs, fmt, arg...) \ - fs_printk(KERN_ERR, fs , fmt , ## arg) +#include "incore.h" +#define fs_emerg(fs, fmt, ...) \ + pr_emerg("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__) +#define fs_warn(fs, fmt, ...) \ + pr_warn("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__) +#define fs_err(fs, fmt, ...) \ + pr_err("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__) +#define fs_info(fs, fmt, ...) \ + pr_info("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__) void gfs2_assert_i(struct gfs2_sbd *sdp); @@ -41,7 +44,7 @@ int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, #define gfs2_assert_withdraw(sdp, assertion) \ ((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \ - __FUNCTION__, __FILE__, __LINE__)) + __func__, __FILE__, __LINE__)) int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, @@ -49,51 +52,47 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, #define gfs2_assert_warn(sdp, assertion) \ ((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \ - __FUNCTION__, __FILE__, __LINE__)) + __func__, __FILE__, __LINE__)) int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function, char *file, unsigned int line); #define gfs2_consist(sdp) \ -gfs2_consist_i((sdp), 0, __FUNCTION__, __FILE__, __LINE__) +gfs2_consist_i((sdp), 0, __func__, __FILE__, __LINE__) int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, const char *function, char *file, unsigned int line); #define gfs2_consist_inode(ip) \ -gfs2_consist_inode_i((ip), 0, __FUNCTION__, __FILE__, __LINE__) +gfs2_consist_inode_i((ip), 0, __func__, __FILE__, __LINE__) int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, const char *function, char *file, unsigned int line); #define gfs2_consist_rgrpd(rgd) \ -gfs2_consist_rgrpd_i((rgd), 0, __FUNCTION__, __FILE__, __LINE__) +gfs2_consist_rgrpd_i((rgd), 0, __func__, __FILE__, __LINE__) int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, const char *type, const char *function, char *file, unsigned int line); -static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp, - struct buffer_head *bh, - const char *function, - char *file, unsigned int line) +static inline int gfs2_meta_check(struct gfs2_sbd *sdp, + struct buffer_head *bh) { struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; u32 magic = be32_to_cpu(mh->mh_magic); - if (unlikely(magic != GFS2_MAGIC)) - return gfs2_meta_check_ii(sdp, bh, "magic number", function, - file, line); + if (unlikely(magic != GFS2_MAGIC)) { + pr_err("Magic number missing at %llu\n", + (unsigned long long)bh->b_blocknr); + return -EIO; + } return 0; } -#define gfs2_meta_check(sdp, bh) \ -gfs2_meta_check_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__) - - int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, u16 type, u16 t, const char *function, @@ -118,7 +117,7 @@ static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp, } #define gfs2_metatype_check(sdp, bh, type) \ -gfs2_metatype_check_i((sdp), (bh), (type), __FUNCTION__, __FILE__, __LINE__) +gfs2_metatype_check_i((sdp), (bh), (type), __func__, __FILE__, __LINE__) static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type, u16 format) @@ -134,19 +133,24 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, unsigned int line); #define gfs2_io_error(sdp) \ -gfs2_io_error_i((sdp), __FUNCTION__, __FILE__, __LINE__); +gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__); int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, const char *function, char *file, unsigned int line); #define gfs2_io_error_bh(sdp, bh) \ -gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__); +gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__); extern struct kmem_cache *gfs2_glock_cachep; +extern struct kmem_cache *gfs2_glock_aspace_cachep; extern struct kmem_cache *gfs2_inode_cachep; extern struct kmem_cache *gfs2_bufdata_cachep; +extern struct kmem_cache *gfs2_rgrpd_cachep; +extern struct kmem_cache *gfs2_quotad_cachep; +extern struct kmem_cache *gfs2_rsrv_cachep; +extern mempool_t *gfs2_page_pool; static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, unsigned int *p) @@ -161,8 +165,7 @@ static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, #define gfs2_tune_get(sdp, field) \ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) -void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, - unsigned int bit, int new_value); +__printf(2, 3) +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...); #endif /* __UTIL_DOT_H__ */ - diff --git a/fs/gfs2/eattr.c b/fs/gfs2/xattr.c index bee99704ea1..0b81f783f78 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/xattr.c @@ -13,14 +13,13 @@ #include <linux/buffer_head.h> #include <linux/xattr.h> #include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> +#include <linux/posix_acl_xattr.h> #include <asm/uaccess.h> #include "gfs2.h" #include "incore.h" #include "acl.h" -#include "eaops.h" -#include "eattr.h" +#include "xattr.h" #include "glock.h" #include "inode.h" #include "meta_io.h" @@ -39,26 +38,32 @@ * Returns: 1 if the EA should be stuffed */ -static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er, +static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize, unsigned int *size) { - *size = GFS2_EAREQ_SIZE_STUFFED(er); - if (*size <= sdp->sd_jbsize) + unsigned int jbsize = sdp->sd_jbsize; + + /* Stuffed */ + *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8); + + if (*size <= jbsize) return 1; - *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er); + /* Unstuffed */ + *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + + (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8); return 0; } -static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er) +static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize) { unsigned int size; - if (er->er_data_len > GFS2_EA_MAX_DATA_LEN) + if (dsize > GFS2_EA_MAX_DATA_LEN) return -ERANGE; - ea_calc_size(sdp, er, &size); + ea_calc_size(sdp, nsize, dsize, &size); /* This can only happen with 512 byte blocks */ if (size > sdp->sd_jbsize) @@ -114,11 +119,11 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) __be64 *eablk, *end; int error; - error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh); if (error) return error; - if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) { + if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) { error = ea_foreach_i(ip, bh, ea_call, data); goto out; } @@ -152,7 +157,9 @@ out: } struct ea_find { - struct gfs2_ea_request *ef_er; + int type; + const char *name; + size_t namel; struct gfs2_ea_location *ef_el; }; @@ -161,14 +168,13 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, void *private) { struct ea_find *ef = private; - struct gfs2_ea_request *er = ef->ef_er; if (ea->ea_type == GFS2_EATYPE_UNUSED) return 0; - if (ea->ea_type == er->er_type) { - if (ea->ea_name_len == er->er_name_len && - !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) { + if (ea->ea_type == ef->type) { + if (ea->ea_name_len == ef->namel && + !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) { struct gfs2_ea_location *el = ef->ef_el; get_bh(bh); el->el_bh = bh; @@ -181,13 +187,15 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, return 0; } -int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er, - struct gfs2_ea_location *el) +static int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, + struct gfs2_ea_location *el) { struct ea_find ef; int error; - ef.ef_er = er; + ef.type = type; + ef.name = name; + ef.namel = strlen(name); ef.ef_el = el; memset(el, 0, sizeof(struct gfs2_ea_location)); @@ -231,6 +239,10 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, unsigned int x; int error; + error = gfs2_rindex_update(sdp); + if (error) + return error; + if (GFS2_EA_IS_STUFFED(ea)) return 0; @@ -244,7 +256,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, if (!blks) return 0; - rgd = gfs2_blk2rgrpd(sdp, bn); + rgd = gfs2_blk2rgrpd(sdp, bn, 1); if (!rgd) { gfs2_consist_inode(ip); return -EIO; @@ -259,7 +271,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, if (error) goto out_gunlock; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); dataptrs = GFS2_EA2DATAPTRS(ea); for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { @@ -277,10 +289,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, } *dataptrs = 0; - if (!ip->i_di.di_blocks) - gfs2_consist_inode(ip); - ip->i_di.di_blocks--; - gfs2_set_inode_blocks(&ip->i_inode); + gfs2_add_inode_blocks(&ip->i_inode, -1); } if (bstart) gfs2_free_meta(ip, bstart, blen); @@ -301,7 +310,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -317,27 +326,20 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, int leave) { - struct gfs2_alloc *al; int error; - al = gfs2_alloc_get(ip); - - error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_rindex_update(GFS2_SB(&ip->i_inode)); if (error) - goto out_alloc; + return error; - error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) - goto out_quota; + goto out_alloc; error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL); - gfs2_glock_dq_uninit(&al->al_ri_gh); - -out_quota: gfs2_quota_unhold(ip); out_alloc: - gfs2_alloc_put(ip); return error; } @@ -346,6 +348,20 @@ struct ea_list { unsigned int ei_size; }; +static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea) +{ + switch (ea->ea_type) { + case GFS2_EATYPE_USR: + return 5 + ea->ea_name_len + 1; + case GFS2_EATYPE_SYS: + return 7 + ea->ea_name_len + 1; + case GFS2_EATYPE_SECURITY: + return 9 + ea->ea_name_len + 1; + default: + return 0; + } +} + static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, void *private) @@ -394,29 +410,33 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, } /** - * gfs2_ea_list - - * @ip: - * @er: + * gfs2_listxattr - List gfs2 extended attributes + * @dentry: The dentry whose inode we are interested in + * @buffer: The buffer to write the results + * @size: The size of the buffer * * Returns: actual size of data on success, -errno on error */ -int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) +ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) { + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_ea_request er; struct gfs2_holder i_gh; int error; - if (!er->er_data || !er->er_data_len) { - er->er_data = NULL; - er->er_data_len = 0; + memset(&er, 0, sizeof(struct gfs2_ea_request)); + if (size) { + er.er_data = buffer; + er.er_data_len = size; } error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return error; - if (ip->i_di.di_eattr) { - struct ea_list ei = { .ei_er = er, .ei_size = 0 }; + if (ip->i_eattr) { + struct ea_list ei = { .ei_er = &er, .ei_size = 0 }; error = ea_foreach(ip, ea_list_i, &ei); if (!error) @@ -429,17 +449,18 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) } /** - * ea_get_unstuffed - actually copies the unstuffed data into the - * request buffer + * ea_iter_unstuffed - copies the unstuffed xattr data to/from the + * request buffer * @ip: The GFS2 inode * @ea: The extended attribute header structure - * @data: The data to be copied + * @din: The data to be copied in + * @dout: The data to be copied out (one of din,dout will be NULL) * * Returns: errno */ -static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, - char *data) +static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, + const char *din, char *dout) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head **bh; @@ -448,8 +469,10 @@ static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, __be64 *dataptrs = GFS2_EA2DATAPTRS(ea); unsigned int x; int error = 0; + unsigned char *pos; + unsigned cp_size; - bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL); + bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS); if (!bh) return -ENOMEM; @@ -478,12 +501,21 @@ static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, goto out; } - memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header), - (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize); + pos = bh[x]->b_data + sizeof(struct gfs2_meta_header); + cp_size = (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize; - amount -= sdp->sd_jbsize; - data += sdp->sd_jbsize; + if (dout) { + memcpy(dout, pos, cp_size); + dout += sdp->sd_jbsize; + } + if (din) { + gfs2_trans_add_meta(ip->i_gl, bh[x]); + memcpy(pos, din, cp_size); + din += sdp->sd_jbsize; + } + + amount -= sdp->sd_jbsize; brelse(bh[x]); } @@ -492,80 +524,87 @@ out: return error; } -int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, - char *data) +static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, + char *data, size_t size) { + int ret; + size_t len = GFS2_EA_DATA_LEN(el->el_ea); + if (len > size) + return -ERANGE; + if (GFS2_EA_IS_STUFFED(el->el_ea)) { - memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea)); - return 0; - } else - return ea_get_unstuffed(ip, el->el_ea, data); + memcpy(data, GFS2_EA2DATA(el->el_ea), len); + return len; + } + ret = gfs2_iter_unstuffed(ip, el->el_ea, NULL, data); + if (ret < 0) + return ret; + return len; } -/** - * gfs2_ea_get_i - - * @ip: The GFS2 inode - * @er: The request structure - * - * Returns: actual size of data on success, -errno on error - */ - -int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) +int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata) { struct gfs2_ea_location el; int error; + int len; + char *data; - if (!ip->i_di.di_eattr) - return -ENODATA; - - error = gfs2_ea_find(ip, er, &el); + error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, &el); if (error) return error; if (!el.el_ea) - return -ENODATA; + goto out; + if (!GFS2_EA_DATA_LEN(el.el_ea)) + goto out; - if (er->er_data_len) { - if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len) - error = -ERANGE; - else - error = gfs2_ea_get_copy(ip, &el, er->er_data); - } - if (!error) - error = GFS2_EA_DATA_LEN(el.el_ea); + len = GFS2_EA_DATA_LEN(el.el_ea); + data = kmalloc(len, GFP_NOFS); + error = -ENOMEM; + if (data == NULL) + goto out; + error = gfs2_ea_get_copy(ip, &el, data, len); + if (error < 0) + kfree(data); + else + *ppdata = data; +out: brelse(el.el_bh); - return error; } /** - * gfs2_ea_get - - * @ip: The GFS2 inode - * @er: The request structure + * gfs2_xattr_get - Get a GFS2 extended attribute + * @inode: The inode + * @name: The name of the extended attribute + * @buffer: The buffer to write the result into + * @size: The size of the buffer + * @type: The type of extended attribute * * Returns: actual size of data on success, -errno on error */ - -int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) +static int gfs2_xattr_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { - struct gfs2_holder i_gh; + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_ea_location el; int error; - if (!er->er_name_len || - er->er_name_len > GFS2_EA_MAX_NAME_LEN) + if (!ip->i_eattr) + return -ENODATA; + if (strlen(name) > GFS2_EA_MAX_NAME_LEN) return -EINVAL; - if (!er->er_data || !er->er_data_len) { - er->er_data = NULL; - er->er_data_len = 0; - } - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + error = gfs2_ea_find(ip, type, name, &el); if (error) return error; - - error = gfs2_ea_ops[er->er_type]->eo_get(ip, er); - - gfs2_glock_dq_uninit(&i_gh); + if (!el.el_ea) + return -ENODATA; + if (size) + error = gfs2_ea_get_copy(ip, &el, buffer, size); + else + error = GFS2_EA_DATA_LEN(el.el_ea); + brelse(el.el_bh); return error; } @@ -582,12 +621,16 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_ea_header *ea; + unsigned int n = 1; u64 block; + int error; - block = gfs2_alloc_meta(ip); - + error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); + if (error) + return error; + gfs2_trans_add_unrevoke(sdp, block, 1); *bhp = gfs2_meta_new(ip->i_gl, block); - gfs2_trans_add_bh(ip->i_gl, *bhp, 1); + gfs2_trans_add_meta(ip->i_gl, *bhp); gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA); gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header)); @@ -597,8 +640,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) ea->ea_flags = GFS2_EAFLAG_LAST; ea->ea_num_ptrs = 0; - ip->i_di.di_blocks++; - gfs2_set_inode_blocks(&ip->i_inode); + gfs2_add_inode_blocks(&ip->i_inode, 1); return 0; } @@ -619,6 +661,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, struct gfs2_ea_request *er) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int error; ea->ea_data_len = cpu_to_be32(er->er_data_len); ea->ea_name_len = er->er_name_len; @@ -642,15 +685,17 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, struct buffer_head *bh; u64 block; int mh_size = sizeof(struct gfs2_meta_header); + unsigned int n = 1; - block = gfs2_alloc_meta(ip); - + error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); + if (error) + return error; + gfs2_trans_add_unrevoke(sdp, block, 1); bh = gfs2_meta_new(ip->i_gl, block); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED); - ip->i_di.di_blocks++; - gfs2_set_inode_blocks(&ip->i_inode); + gfs2_add_inode_blocks(&ip->i_inode, 1); copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize : data_len; @@ -679,28 +724,24 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, unsigned int blks, ea_skeleton_call_t skeleton_call, void *private) { - struct gfs2_alloc *al; + struct gfs2_alloc_parms ap = { .target = blks }; struct buffer_head *dibh; int error; - al = gfs2_alloc_get(ip); - - error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_rindex_update(GFS2_SB(&ip->i_inode)); if (error) - goto out; + return error; - error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); + error = gfs2_quota_lock_check(ip); if (error) - goto out_gunlock_q; - - al->al_requested = blks; + return error; - error = gfs2_inplace_reserve(ip); + error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_gunlock_q; error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), - blks + al->al_rgd->rd_length + + blks + gfs2_rg_blocks(ip, blks) + RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (error) goto out_ipres; @@ -711,14 +752,8 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - if (er->er_flags & GFS2_ERF_MODE) { - gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), - (ip->i_inode.i_mode & S_IFMT) == - (er->er_mode & S_IFMT)); - ip->i_inode.i_mode = er->er_mode; - } ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -729,8 +764,6 @@ out_ipres: gfs2_inplace_release(ip); out_gunlock_q: gfs2_quota_unlock(ip); -out: - gfs2_alloc_put(ip); return error; } @@ -744,7 +777,7 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (error) return error; - ip->i_di.di_eattr = bh->b_blocknr; + ip->i_eattr = bh->b_blocknr; error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er); brelse(bh); @@ -760,15 +793,23 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, * Returns: errno */ -static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er) +static int ea_init(struct gfs2_inode *ip, int type, const char *name, + const void *data, size_t size) { + struct gfs2_ea_request er; unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize; unsigned int blks = 1; - if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize) - blks += DIV_ROUND_UP(er->er_data_len, jbsize); + er.er_type = type; + er.er_name = name; + er.er_name_len = strlen(name); + er.er_data = (void *)data; + er.er_data_len = size; + + if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize) + blks += DIV_ROUND_UP(er.er_data_len, jbsize); - return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL); + return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL); } static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea) @@ -795,7 +836,7 @@ static void ea_set_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_header *prev = el->el_prev; u32 len; - gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + gfs2_trans_add_meta(ip->i_gl, el->el_bh); if (!prev || !GFS2_EA_IS_STUFFED(ea)) { ea->ea_type = GFS2_EATYPE_UNUSED; @@ -833,7 +874,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, if (error) return error; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); if (es->ea_split) ea = ea_split_ea(ea); @@ -846,14 +887,8 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto out; - - if (er->er_flags & GFS2_ERF_MODE) { - gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), - (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT)); - ip->i_inode.i_mode = er->er_mode; - } ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); out: @@ -868,7 +903,7 @@ static int ea_set_simple_alloc(struct gfs2_inode *ip, struct gfs2_ea_header *ea = es->es_ea; int error; - gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1); + gfs2_trans_add_meta(ip->i_gl, es->es_bh); if (es->ea_split) ea = ea_split_ea(ea); @@ -892,7 +927,8 @@ static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh, int stuffed; int error; - stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size); + stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len, + es->es_er->er_data_len, &size); if (ea->ea_type == GFS2_EATYPE_UNUSED) { if (GFS2_EA_REC_LEN(ea) < size) @@ -938,10 +974,10 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, int error; int mh_size = sizeof(struct gfs2_meta_header); - if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { + if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { __be64 *end; - error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh); if (error) return error; @@ -963,23 +999,24 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, goto out; } - gfs2_trans_add_bh(ip->i_gl, indbh, 1); + gfs2_trans_add_meta(ip->i_gl, indbh); } else { u64 blk; - - blk = gfs2_alloc_meta(ip); - + unsigned int n = 1; + error = gfs2_alloc_blocks(ip, &blk, &n, 0, NULL); + if (error) + return error; + gfs2_trans_add_unrevoke(sdp, blk, 1); indbh = gfs2_meta_new(ip->i_gl, blk); - gfs2_trans_add_bh(ip->i_gl, indbh, 1); + gfs2_trans_add_meta(ip->i_gl, indbh); gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); gfs2_buffer_clear_tail(indbh, mh_size); eablk = (__be64 *)(indbh->b_data + mh_size); - *eablk = cpu_to_be64(ip->i_di.di_eattr); - ip->i_di.di_eattr = blk; - ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT; - ip->i_di.di_blocks++; - gfs2_set_inode_blocks(&ip->i_inode); + *eablk = cpu_to_be64(ip->i_eattr); + ip->i_eattr = blk; + ip->i_diskflags |= GFS2_DIF_EA_INDIRECT; + gfs2_add_inode_blocks(&ip->i_inode, 1); eablk++; } @@ -1002,15 +1039,22 @@ out: return error; } -static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, - struct gfs2_ea_location *el) +static int ea_set_i(struct gfs2_inode *ip, int type, const char *name, + const void *value, size_t size, struct gfs2_ea_location *el) { + struct gfs2_ea_request er; struct ea_set es; unsigned int blks = 2; int error; + er.er_type = type; + er.er_name = name; + er.er_data = (void *)value; + er.er_name_len = strlen(name); + er.er_data_len = size; + memset(&es, 0, sizeof(struct ea_set)); - es.es_er = er; + es.es_er = &er; es.es_el = el; error = ea_foreach(ip, ea_set_simple, &es); @@ -1019,12 +1063,12 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (error) return error; - if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) + if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) blks++; - if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) - blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); + if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize) + blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); - return ea_alloc_skeleton(ip, er, blks, ea_set_block, el); + return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el); } static int ea_set_remove_unstuffed(struct gfs2_inode *ip, @@ -1036,75 +1080,7 @@ static int ea_set_remove_unstuffed(struct gfs2_inode *ip, GFS2_EA2NEXT(el->el_prev) == el->el_ea); } - return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0); -} - -int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - struct gfs2_ea_location el; - int error; - - if (!ip->i_di.di_eattr) { - if (er->er_flags & XATTR_REPLACE) - return -ENODATA; - return ea_init(ip, er); - } - - error = gfs2_ea_find(ip, er, &el); - if (error) - return error; - - if (el.el_ea) { - if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) { - brelse(el.el_bh); - return -EPERM; - } - - error = -EEXIST; - if (!(er->er_flags & XATTR_CREATE)) { - int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); - error = ea_set_i(ip, er, &el); - if (!error && unstuffed) - ea_set_remove_unstuffed(ip, &el); - } - - brelse(el.el_bh); - } else { - error = -ENODATA; - if (!(er->er_flags & XATTR_REPLACE)) - error = ea_set_i(ip, er, NULL); - } - - return error; -} - -int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - struct gfs2_holder i_gh; - int error; - - if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN) - return -EINVAL; - if (!er->er_data || !er->er_data_len) { - er->er_data = NULL; - er->er_data_len = 0; - } - error = ea_check_size(GFS2_SB(&ip->i_inode), er); - if (error) - return error; - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); - if (error) - return error; - - if (IS_IMMUTABLE(&ip->i_inode)) - error = -EPERM; - else - error = gfs2_ea_ops[er->er_type]->eo_set(ip, er); - - gfs2_glock_dq_uninit(&i_gh); - - return error; + return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0); } static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) @@ -1118,7 +1094,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) if (error) return error; - gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + gfs2_trans_add_meta(ip->i_gl, el->el_bh); if (prev) { u32 len; @@ -1128,13 +1104,14 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) if (GFS2_EA_IS_LAST(ea)) prev->ea_flags |= GFS2_EAFLAG_LAST; - } else + } else { ea->ea_type = GFS2_EATYPE_UNUSED; + } error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -1144,15 +1121,28 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) return error; } -int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) +/** + * gfs2_xattr_remove - Remove a GFS2 extended attribute + * @ip: The inode + * @type: The type of the extended attribute + * @name: The name of the extended attribute + * + * This is not called directly by the VFS since we use the (common) + * scheme of making a "set with NULL data" mean a remove request. Note + * that this is different from a set with zero length data. + * + * Returns: 0, or errno on failure + */ + +static int gfs2_xattr_remove(struct gfs2_inode *ip, int type, const char *name) { struct gfs2_ea_location el; int error; - if (!ip->i_di.di_eattr) + if (!ip->i_eattr) return -ENODATA; - error = gfs2_ea_find(ip, er, &el); + error = gfs2_ea_find(ip, type, name, &el); if (error) return error; if (!el.el_ea) @@ -1161,8 +1151,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) if (GFS2_EA_IS_STUFFED(el.el_ea)) error = ea_remove_stuffed(ip, &el); else - error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, - 0); + error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0); brelse(el.el_bh); @@ -1170,131 +1159,128 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) } /** - * gfs2_ea_remove - sets (or creates or replaces) an extended attribute - * @ip: pointer to the inode of the target file - * @er: request information + * __gfs2_xattr_set - Set (or remove) a GFS2 extended attribute + * @ip: The inode + * @name: The name of the extended attribute + * @value: The value of the extended attribute (NULL for remove) + * @size: The size of the @value argument + * @flags: Create or Replace + * @type: The type of the extended attribute * - * Returns: errno + * See gfs2_xattr_remove() for details of the removal of xattrs. + * + * Returns: 0 or errno on failure */ -int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) +int __gfs2_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags, int type) { - struct gfs2_holder i_gh; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_ea_location el; + unsigned int namel = strlen(name); int error; - if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN) - return -EINVAL; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + if (namel > GFS2_EA_MAX_NAME_LEN) + return -ERANGE; + + if (value == NULL) + return gfs2_xattr_remove(ip, type, name); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (ea_check_size(sdp, namel, size)) + return -ERANGE; + + if (!ip->i_eattr) { + if (flags & XATTR_REPLACE) + return -ENODATA; + return ea_init(ip, type, name, value, size); + } + + error = gfs2_ea_find(ip, type, name, &el); if (error) return error; - if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) - error = -EPERM; - else - error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er); + if (el.el_ea) { + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) { + brelse(el.el_bh); + return -EPERM; + } - gfs2_glock_dq_uninit(&i_gh); + error = -EEXIST; + if (!(flags & XATTR_CREATE)) { + int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); + error = ea_set_i(ip, type, name, value, size, &el); + if (!error && unstuffed) + ea_set_remove_unstuffed(ip, &el); + } + + brelse(el.el_bh); + return error; + } + + error = -ENODATA; + if (!(flags & XATTR_REPLACE)) + error = ea_set_i(ip, type, name, value, size, NULL); return error; } +static int gfs2_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + return __gfs2_xattr_set(dentry->d_inode, name, value, + size, flags, type); +} + + static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, char *data) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct buffer_head **bh; unsigned int amount = GFS2_EA_DATA_LEN(ea); unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize); - __be64 *dataptrs = GFS2_EA2DATAPTRS(ea); - unsigned int x; - int error; - - bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL); - if (!bh) - return -ENOMEM; + int ret; - error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0); - if (error) - goto out; - - for (x = 0; x < nptrs; x++) { - error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, - bh + x); - if (error) { - while (x--) - brelse(bh[x]); - goto fail; - } - dataptrs++; - } + ret = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0); + if (ret) + return ret; - for (x = 0; x < nptrs; x++) { - error = gfs2_meta_wait(sdp, bh[x]); - if (error) { - for (; x < nptrs; x++) - brelse(bh[x]); - goto fail; - } - if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) { - for (; x < nptrs; x++) - brelse(bh[x]); - error = -EIO; - goto fail; - } - - gfs2_trans_add_bh(ip->i_gl, bh[x], 1); - - memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data, - (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize); - - amount -= sdp->sd_jbsize; - data += sdp->sd_jbsize; - - brelse(bh[x]); - } - -out: - kfree(bh); - return error; - -fail: + ret = gfs2_iter_unstuffed(ip, ea, data, NULL); gfs2_trans_end(sdp); - kfree(bh); - return error; + + return ret; } -int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, - struct iattr *attr, char *data) +int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) { - struct buffer_head *dibh; + struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_ea_location el; int error; - if (GFS2_EA_IS_STUFFED(el->el_ea)) { - error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); - if (error) - return error; - - gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); - memcpy(GFS2_EA2DATA(el->el_ea), data, - GFS2_EA_DATA_LEN(el->el_ea)); - } else - error = ea_acl_chmod_unstuffed(ip, el->el_ea, data); - + error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el); if (error) return error; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (!error) { - error = inode_setattr(&ip->i_inode, attr); - gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); + if (GFS2_EA_IS_STUFFED(el.el_ea)) { + error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0); + if (error == 0) { + gfs2_trans_add_meta(ip->i_gl, el.el_bh); + memcpy(GFS2_EA2DATA(el.el_ea), data, + GFS2_EA_DATA_LEN(el.el_ea)); + } + } else { + error = ea_acl_chmod_unstuffed(ip, el.el_ea, data); } - gfs2_trans_end(GFS2_SB(&ip->i_inode)); + brelse(el.el_bh); + if (error) + return error; + error = gfs2_setattr_simple(inode, attr); + gfs2_trans_end(sdp); return error; } @@ -1311,9 +1297,13 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) unsigned int x; int error; + error = gfs2_rindex_update(sdp); + if (error) + return error; + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); - error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh); if (error) return error; @@ -1336,18 +1326,18 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) blen++; else { if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); bstart = bn; blen = 1; } blks++; } if (bstart) - gfs2_rlist_add(sdp, &rlist, bstart); + gfs2_rlist_add(ip, &rlist, bstart); else goto out; - gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); for (x = 0; x < rlist.rl_rgrps; x++) { struct gfs2_rgrpd *rgd; @@ -1364,7 +1354,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) if (error) goto out_gunlock; - gfs2_trans_add_bh(ip->i_gl, indbh, 1); + gfs2_trans_add_meta(ip->i_gl, indbh); eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header)); bstart = 0; @@ -1387,19 +1377,16 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) } *eablk = 0; - if (!ip->i_di.di_blocks) - gfs2_consist_inode(ip); - ip->i_di.di_blocks--; - gfs2_set_inode_blocks(&ip->i_inode); + gfs2_add_inode_blocks(&ip->i_inode, -1); } if (bstart) gfs2_free_meta(ip, bstart, blen); - ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT; + ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT; error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -1418,19 +1405,22 @@ out: static int ea_dealloc_block(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; struct gfs2_rgrpd *rgd; struct buffer_head *dibh; + struct gfs2_holder gh; int error; - rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr); + error = gfs2_rindex_update(sdp); + if (error) + return error; + + rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr, 1); if (!rgd) { gfs2_consist_inode(ip); return -EIO; } - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, - &al->al_rgd_gh); + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); if (error) return error; @@ -1439,17 +1429,14 @@ static int ea_dealloc_block(struct gfs2_inode *ip) if (error) goto out_gunlock; - gfs2_free_meta(ip, ip->i_di.di_eattr, 1); + gfs2_free_meta(ip, ip->i_eattr, 1); - ip->i_di.di_eattr = 0; - if (!ip->i_di.di_blocks) - gfs2_consist_inode(ip); - ip->i_di.di_blocks--; - gfs2_set_inode_blocks(&ip->i_inode); + ip->i_eattr = 0; + gfs2_add_inode_blocks(&ip->i_inode, -1); error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -1457,7 +1444,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) gfs2_trans_end(sdp); out_gunlock: - gfs2_glock_dq_uninit(&al->al_rgd_gh); + gfs2_glock_dq_uninit(&gh); return error; } @@ -1470,37 +1457,52 @@ out_gunlock: int gfs2_ea_dealloc(struct gfs2_inode *ip) { - struct gfs2_alloc *al; int error; - al = gfs2_alloc_get(ip); - - error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + error = gfs2_rindex_update(GFS2_SB(&ip->i_inode)); if (error) - goto out_alloc; + return error; - error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) - goto out_quota; + return error; error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); if (error) - goto out_rindex; + goto out_quota; - if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { + if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { error = ea_dealloc_indirect(ip); if (error) - goto out_rindex; + goto out_quota; } error = ea_dealloc_block(ip); -out_rindex: - gfs2_glock_dq_uninit(&al->al_ri_gh); out_quota: gfs2_quota_unhold(ip); -out_alloc: - gfs2_alloc_put(ip); return error; } +static const struct xattr_handler gfs2_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = GFS2_EATYPE_USR, + .get = gfs2_xattr_get, + .set = gfs2_xattr_set, +}; + +static const struct xattr_handler gfs2_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = GFS2_EATYPE_SECURITY, + .get = gfs2_xattr_get, + .set = gfs2_xattr_set, +}; + +const struct xattr_handler *gfs2_xattr_handlers[] = { + &gfs2_xattr_user_handler, + &gfs2_xattr_security_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + NULL, +}; + diff --git a/fs/gfs2/eattr.h b/fs/gfs2/xattr.h index c82dbe01d71..d392f8358f2 100644 --- a/fs/gfs2/eattr.h +++ b/fs/gfs2/xattr.h @@ -19,7 +19,7 @@ struct iattr; #define GFS2_EA_SIZE(ea) \ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \ - (sizeof(__be64) * (ea)->ea_num_ptrs)), 8) + (sizeof(__be64) * (ea)->ea_num_ptrs)), 8) #define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs) #define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST) @@ -27,10 +27,6 @@ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ #define GFS2_EAREQ_SIZE_STUFFED(er) \ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8) -#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \ -ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \ - sizeof(__be64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8) - #define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1)) #define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len) @@ -43,16 +39,12 @@ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \ #define GFS2_EA_BH2FIRST(bh) \ ((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header))) -#define GFS2_ERF_MODE 0x80000000 - struct gfs2_ea_request { const char *er_name; char *er_data; unsigned int er_name_len; unsigned int er_data_len; unsigned int er_type; /* GFS2_EATYPE_... */ - int er_flags; - mode_t er_mode; }; struct gfs2_ea_location { @@ -61,40 +53,15 @@ struct gfs2_ea_location { struct gfs2_ea_header *el_prev; }; -int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); - -int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er); - -int gfs2_ea_dealloc(struct gfs2_inode *ip); +extern int __gfs2_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, + int flags, int type); +extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); +extern int gfs2_ea_dealloc(struct gfs2_inode *ip); /* Exported to acl.c */ -int gfs2_ea_find(struct gfs2_inode *ip, - struct gfs2_ea_request *er, - struct gfs2_ea_location *el); -int gfs2_ea_get_copy(struct gfs2_inode *ip, - struct gfs2_ea_location *el, - char *data); -int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, - struct iattr *attr, char *data); - -static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea) -{ - switch (ea->ea_type) { - case GFS2_EATYPE_USR: - return 5 + ea->ea_name_len + 1; - case GFS2_EATYPE_SYS: - return 7 + ea->ea_name_len + 1; - case GFS2_EATYPE_SECURITY: - return 9 + ea->ea_name_len + 1; - default: - return 0; - } -} +extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data); +extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data); #endif /* __EATTR_DOT_H__ */ |
