diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-06-16 12:11:57 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-06-16 12:11:57 -0700 |
commit | 300df7dc89cc276377fc020704e34875d5c473b6 (patch) | |
tree | 6b280000815b0562255cecf3da1a8c9597bce702 /fs/ocfs2 | |
parent | 661adc423d70203a56723701aaf255e16950dfdb (diff) | |
parent | 9af0b38ff3f4f79c62dd909405b113bf7c1a23aa (diff) |
Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2
* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2:
ocfs2/net: Use wait_event() in o2net_send_message_vec()
ocfs2: Adjust rightmost path in ocfs2_add_branch.
ocfs2: fdatasync should skip unimportant metadata writeout
ocfs2: Remove redundant gotos in ocfs2_mount_volume()
ocfs2: Add statistics for the checksum and ecc operations.
ocfs2 patch to track delayed orphan scan timer statistics
ocfs2: timer to queue scan of all orphan slots
ocfs2: Correct ordering of ip_alloc_sem and localloc locks for directories
ocfs2: Fix possible deadlock in quota recovery
ocfs2: Fix possible deadlock with quotas in ocfs2_setattr()
ocfs2: Fix lock inversion in ocfs2_local_read_info()
ocfs2: Fix possible deadlock in ocfs2_global_read_dquot()
ocfs2: update comments in masklog.h
ocfs2: Don't printk the error when listing too many xattrs.
Diffstat (limited to 'fs/ocfs2')
-rw-r--r-- | fs/ocfs2/alloc.c | 80 | ||||
-rw-r--r-- | fs/ocfs2/blockcheck.c | 184 | ||||
-rw-r--r-- | fs/ocfs2/blockcheck.h | 29 | ||||
-rw-r--r-- | fs/ocfs2/cluster/masklog.h | 35 | ||||
-rw-r--r-- | fs/ocfs2/cluster/tcp.c | 7 | ||||
-rw-r--r-- | fs/ocfs2/dir.c | 21 | ||||
-rw-r--r-- | fs/ocfs2/dlmglue.c | 51 | ||||
-rw-r--r-- | fs/ocfs2/dlmglue.h | 11 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 56 | ||||
-rw-r--r-- | fs/ocfs2/journal.c | 111 | ||||
-rw-r--r-- | fs/ocfs2/journal.h | 4 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2.h | 16 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2_lockid.h | 5 | ||||
-rw-r--r-- | fs/ocfs2/quota_global.c | 4 | ||||
-rw-r--r-- | fs/ocfs2/quota_local.c | 21 | ||||
-rw-r--r-- | fs/ocfs2/super.c | 66 | ||||
-rw-r--r-- | fs/ocfs2/xattr.c | 5 |
17 files changed, 610 insertions, 96 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 678a067d925..9edcde4974a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -475,6 +475,12 @@ struct ocfs2_path { #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) #define path_num_items(_path) ((_path)->p_tree_depth + 1) +static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, + u32 cpos); +static void ocfs2_adjust_rightmost_records(struct inode *inode, + handle_t *handle, + struct ocfs2_path *path, + struct ocfs2_extent_rec *insert_rec); /* * Reset the actual path elements so that we can re-use the structure * to build another path. Generally, this involves freeing the buffer @@ -1013,6 +1019,54 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) } /* + * Change range of the branches in the right most path according to the leaf + * extent block's rightmost record. + */ +static int ocfs2_adjust_rightmost_branch(handle_t *handle, + struct inode *inode, + struct ocfs2_extent_tree *et) +{ + int status; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + path = ocfs2_new_path_from_et(et); + if (!path) { + status = -ENOMEM; + return status; + } + + status = ocfs2_find_path(inode, path, UINT_MAX); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_extend_trans(handle, path_num_items(path) + + handle->h_buffer_credits); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_journal_access_path(inode, handle, path); + if (status < 0) { + mlog_errno(status); + goto out; + } + + el = path_leaf_el(path); + rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1]; + + ocfs2_adjust_rightmost_records(inode, handle, path, rec); + +out: + ocfs2_free_path(path); + return status; +} + +/* * Add an entire tree branch to our inode. eb_bh is the extent block * to start at, if we don't want to start the branch at the dinode * structure. 
@@ -1038,7 +1092,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *eb_el; struct ocfs2_extent_list *el; - u32 new_cpos; + u32 new_cpos, root_end; mlog_entry_void(); @@ -1055,6 +1109,27 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, new_blocks = le16_to_cpu(el->l_tree_depth); + eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data; + new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); + root_end = ocfs2_sum_rightmost_rec(et->et_root_el); + + /* + * If there is a gap before the root end and the real end + * of the righmost leaf block, we need to remove the gap + * between new_cpos and root_end first so that the tree + * is consistent after we add a new branch(it will start + * from new_cpos). + */ + if (root_end > new_cpos) { + mlog(0, "adjust the cluster end from %u to %u\n", + root_end, new_cpos); + status = ocfs2_adjust_rightmost_branch(handle, inode, et); + if (status) { + mlog_errno(status); + goto bail; + } + } + /* allocate the number of new eb blocks we need */ new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), GFP_KERNEL); @@ -1071,9 +1146,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, goto bail; } - eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data; - new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); - /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be * linked with the rest of the tree. * conversly, new_eb_bhs[0] is the new bottommost leaf. 
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index 2a947c44e59..a1163b8b417 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -22,6 +22,9 @@ #include <linux/crc32.h> #include <linux/buffer_head.h> #include <linux/bitops.h> +#include <linux/debugfs.h> +#include <linux/module.h> +#include <linux/fs.h> #include <asm/byteorder.h> #include <cluster/masklog.h> @@ -222,6 +225,155 @@ void ocfs2_hamming_fix_block(void *data, unsigned int blocksize, ocfs2_hamming_fix(data, blocksize * 8, 0, fix); } + +/* + * Debugfs handling. + */ + +#ifdef CONFIG_DEBUG_FS + +static int blockcheck_u64_get(void *data, u64 *val) +{ + *val = *(u64 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); + +static struct dentry *blockcheck_debugfs_create(const char *name, + struct dentry *parent, + u64 *value) +{ + return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value, + &blockcheck_fops); +} + +static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) +{ + if (stats) { + debugfs_remove(stats->b_debug_check); + stats->b_debug_check = NULL; + debugfs_remove(stats->b_debug_failure); + stats->b_debug_failure = NULL; + debugfs_remove(stats->b_debug_recover); + stats->b_debug_recover = NULL; + debugfs_remove(stats->b_debug_dir); + stats->b_debug_dir = NULL; + } +} + +static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) +{ + int rc = -EINVAL; + + if (!stats) + goto out; + + stats->b_debug_dir = debugfs_create_dir("blockcheck", parent); + if (!stats->b_debug_dir) + goto out; + + stats->b_debug_check = + blockcheck_debugfs_create("blocks_checked", + stats->b_debug_dir, + &stats->b_check_count); + + stats->b_debug_failure = + blockcheck_debugfs_create("checksums_failed", + stats->b_debug_dir, + &stats->b_failure_count); + + stats->b_debug_recover = + blockcheck_debugfs_create("ecc_recoveries", + stats->b_debug_dir, + &stats->b_recover_count); + 
if (stats->b_debug_check && stats->b_debug_failure && + stats->b_debug_recover) + rc = 0; + +out: + if (rc) + ocfs2_blockcheck_debug_remove(stats); + return rc; +} +#else +static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) +{ + return 0; +} + +static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) +{ +} +#endif /* CONFIG_DEBUG_FS */ + +/* Always-called wrappers for starting and stopping the debugfs files */ +int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) +{ + return ocfs2_blockcheck_debug_install(stats, parent); +} + +void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats) +{ + ocfs2_blockcheck_debug_remove(stats); +} + +static void ocfs2_blockcheck_inc_check(struct ocfs2_blockcheck_stats *stats) +{ + u64 new_count; + + if (!stats) + return; + + spin_lock(&stats->b_lock); + stats->b_check_count++; + new_count = stats->b_check_count; + spin_unlock(&stats->b_lock); + + if (!new_count) + mlog(ML_NOTICE, "Block check count has wrapped\n"); +} + +static void ocfs2_blockcheck_inc_failure(struct ocfs2_blockcheck_stats *stats) +{ + u64 new_count; + + if (!stats) + return; + + spin_lock(&stats->b_lock); + stats->b_failure_count++; + new_count = stats->b_failure_count; + spin_unlock(&stats->b_lock); + + if (!new_count) + mlog(ML_NOTICE, "Checksum failure count has wrapped\n"); +} + +static void ocfs2_blockcheck_inc_recover(struct ocfs2_blockcheck_stats *stats) +{ + u64 new_count; + + if (!stats) + return; + + spin_lock(&stats->b_lock); + stats->b_recover_count++; + new_count = stats->b_recover_count; + spin_unlock(&stats->b_lock); + + if (!new_count) + mlog(ML_NOTICE, "ECC recovery count has wrapped\n"); +} + + + +/* + * These are the low-level APIs for using the ocfs2_block_check structure. + */ + /* * This function generates check information for a block. * data is the block to be checked. 
bc is a pointer to the @@ -266,12 +418,15 @@ void ocfs2_block_check_compute(void *data, size_t blocksize, * Again, the data passed in should be the on-disk endian. */ int ocfs2_block_check_validate(void *data, size_t blocksize, - struct ocfs2_block_check *bc) + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats) { int rc = 0; struct ocfs2_block_check check; u32 crc, ecc; + ocfs2_blockcheck_inc_check(stats); + check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); check.bc_ecc = le16_to_cpu(bc->bc_ecc); @@ -282,6 +437,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize, if (crc == check.bc_crc32e) goto out; + ocfs2_blockcheck_inc_failure(stats); mlog(ML_ERROR, "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); @@ -292,8 +448,10 @@ int ocfs2_block_check_validate(void *data, size_t blocksize, /* And check the crc32 again */ crc = crc32_le(~0, data, blocksize); - if (crc == check.bc_crc32e) + if (crc == check.bc_crc32e) { + ocfs2_blockcheck_inc_recover(stats); goto out; + } mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); @@ -366,7 +524,8 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, * Again, the data passed in should be the on-disk endian. 
*/ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, - struct ocfs2_block_check *bc) + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats) { int i, rc = 0; struct ocfs2_block_check check; @@ -377,6 +536,8 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, if (!nr) return 0; + ocfs2_blockcheck_inc_check(stats); + check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); check.bc_ecc = le16_to_cpu(bc->bc_ecc); @@ -388,6 +549,7 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, if (crc == check.bc_crc32e) goto out; + ocfs2_blockcheck_inc_failure(stats); mlog(ML_ERROR, "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); @@ -416,8 +578,10 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, /* And check the crc32 again */ for (i = 0, crc = ~0; i < nr; i++) crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); - if (crc == check.bc_crc32e) + if (crc == check.bc_crc32e) { + ocfs2_blockcheck_inc_recover(stats); goto out; + } mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); @@ -448,9 +612,11 @@ int ocfs2_validate_meta_ecc(struct super_block *sb, void *data, struct ocfs2_block_check *bc) { int rc = 0; + struct ocfs2_super *osb = OCFS2_SB(sb); - if (ocfs2_meta_ecc(OCFS2_SB(sb))) - rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc); + if (ocfs2_meta_ecc(osb)) + rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc, + &osb->osb_ecc_stats); return rc; } @@ -468,9 +634,11 @@ int ocfs2_validate_meta_ecc_bhs(struct super_block *sb, struct ocfs2_block_check *bc) { int rc = 0; + struct ocfs2_super *osb = OCFS2_SB(sb); - if (ocfs2_meta_ecc(OCFS2_SB(sb))) - rc = ocfs2_block_check_validate_bhs(bhs, nr, bc); + if (ocfs2_meta_ecc(osb)) + rc = ocfs2_block_check_validate_bhs(bhs, nr, bc, + &osb->osb_ecc_stats); return rc; } diff --git a/fs/ocfs2/blockcheck.h 
b/fs/ocfs2/blockcheck.h index 70ec3feda32..d4b69febf70 100644 --- a/fs/ocfs2/blockcheck.h +++ b/fs/ocfs2/blockcheck.h @@ -21,6 +21,24 @@ #define OCFS2_BLOCKCHECK_H +/* Count errors and error correction from blockcheck.c */ +struct ocfs2_blockcheck_stats { + spinlock_t b_lock; + u64 b_check_count; /* Number of blocks we've checked */ + u64 b_failure_count; /* Number of failed checksums */ + u64 b_recover_count; /* Number of blocks fixed by ecc */ + + /* + * debugfs entries, used if this is passed to + * ocfs2_blockcheck_stats_debugfs_install() + */ + struct dentry *b_debug_dir; /* Parent of the debugfs files */ + struct dentry *b_debug_check; /* Exposes b_check_count */ + struct dentry *b_debug_failure; /* Exposes b_failure_count */ + struct dentry *b_debug_recover; /* Exposes b_recover_count */ +}; + + /* High level block API */ void ocfs2_compute_meta_ecc(struct super_block *sb, void *data, struct ocfs2_block_check *bc); @@ -37,11 +55,18 @@ int ocfs2_validate_meta_ecc_bhs(struct super_block *sb, void ocfs2_block_check_compute(void *data, size_t blocksize, struct ocfs2_block_check *bc); int ocfs2_block_check_validate(void *data, size_t blocksize, - struct ocfs2_block_check *bc); + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats); void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, struct ocfs2_block_check *bc); int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, - struct ocfs2_block_check *bc); + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats); + +/* Debug Initialization */ +int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent); +void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats); /* * Hamming code functions diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 7e72a81bc2d..696c32e5071 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -48,34 +48,33 @@ * only emit 
the appropriage printk() when the caller passes in a constant * mask, as is almost always the case. * - * All this bitmask nonsense is hidden from the /proc interface so that Joel - * doesn't have an aneurism. Reading the file gives a straight forward - * indication of which bits are on or off: - * ENTRY off - * EXIT off + * All this bitmask nonsense is managed from the files under + * /sys/fs/o2cb/logmask/. Reading the files gives a straightforward + * indication of which bits are allowed (allow) or denied (off/deny). + * ENTRY deny + * EXIT deny * TCP off * MSG off * SOCKET off - * ERROR off - * NOTICE on + * ERROR allow + * NOTICE allow * * Writing changes the state of a given bit and requires a strictly formatted * single write() call: * - * write(fd, "ENTRY on", 8); + * write(fd, "allow", 5); * - * would turn the entry bit on. "1" is also accepted in the place of "on", and - * "off" and "0" behave as expected. + * Echoing allow/deny/off string into the logmask files can flip the bits + * on or off as expected; here is the bash script for example: * - * Some trivial shell can flip all the bits on or off: + * log_mask="/sys/fs/o2cb/log_mask" + * for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do + * echo allow >"$log_mask"/"$node" + * done * - * log_mask="/proc/fs/ocfs2_nodemanager/log_mask" - * cat $log_mask | ( - * while read bit status; do - * # $1 is "on" or "off", say - * echo "$bit $1" > $log_mask - * done - * ) + * The debugfs.ocfs2 tool can also flip the bits with the -l option: + * + * debugfs.ocfs2 -l TCP allow */ /* for task_struct */ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 9fbe849f634..334f231a422 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -974,7 +974,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn, int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, size_t caller_veclen, u8 target_node, int *status) { - int ret, error = 0; + int ret; struct o2net_msg *msg = 
NULL; size_t veclen, caller_bytes = 0; struct kvec *vec = NULL; @@ -1015,10 +1015,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, o2net_set_nst_sock_time(&nst); - ret = wait_event_interruptible(nn->nn_sc_wq, - o2net_tx_can_proceed(nn, &sc, &error)); - if (!ret && error) - ret = error; + wait_event(nn->nn_sc_wq, o2net_tx_can_proceed(nn, &sc, &ret)); if (ret) goto out; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index c5752305627..b358f3bf896 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2900,6 +2900,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, alloc = ocfs2_clusters_for_bytes(sb, bytes); dx_alloc = 0; + down_write(&oi->ip_alloc_sem); + if (ocfs2_supports_indexed_dirs(osb)) { credits += ocfs2_add_dir_index_credits(sb); @@ -2940,8 +2942,6 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, goto out; } - down_write(&oi->ip_alloc_sem); - /* * Prepare for worst case allocation scenario of two separate * extents in the unindexed tree. 
@@ -2953,7 +2953,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); - goto out_sem; + goto out; } if (vfs_dq_alloc_space_nodirty(dir, @@ -3172,10 +3172,8 @@ out_commit: ocfs2_commit_trans(osb, handle); -out_sem: - up_write(&oi->ip_alloc_sem); - out: + up_write(&oi->ip_alloc_sem); if (data_ac) ocfs2_free_alloc_context(data_ac); if (meta_ac) @@ -3322,11 +3320,15 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, brelse(new_bh); new_bh = NULL; + down_write(&OCFS2_I(dir)->ip_alloc_sem); + drop_alloc_sem = 1; dir_i_size = i_size_read(dir); credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; goto do_extend; } + down_write(&OCFS2_I(dir)->ip_alloc_sem); + drop_alloc_sem = 1; dir_i_size = i_size_read(dir); mlog(0, "extending dir %llu (i_size = %lld)\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size); @@ -3370,9 +3372,6 @@ do_extend: credits++; /* For attaching the new dirent block to the * dx_root */ - down_write(&OCFS2_I(dir)->ip_alloc_sem); - drop_alloc_sem = 1; - handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -3435,10 +3434,10 @@ bail_bh: *new_de_bh = new_bh; get_bh(*new_de_bh); bail: - if (drop_alloc_sem) - up_write(&OCFS2_I(dir)->ip_alloc_sem); if (handle) ocfs2_commit_trans(osb, handle); + if (drop_alloc_sem) + up_write(&OCFS2_I(dir)->ip_alloc_sem); if (data_ac) ocfs2_free_alloc_context(data_ac); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index e15fc7d5082..6cdeaa76f27 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -248,6 +248,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = { .flags = 0, }; +static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = { + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, +}; + static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { .get_osb = ocfs2_get_dentry_osb, .post_unlock = ocfs2_dentry_post_unlock, @@ -637,6 +641,19 @@ static void 
ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res, &ocfs2_nfs_sync_lops, osb); } +static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan_lvb *lvb; + + ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN, + &ocfs2_orphan_scan_lops, osb); + lvb = ocfs2_dlm_lvb(&res->l_lksb); + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION; +} + void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, struct ocfs2_file_private *fp) { @@ -2352,6 +2369,37 @@ void ocfs2_inode_unlock(struct inode *inode, mlog_exit_void(); } +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex) +{ + struct ocfs2_lock_res *lockres; + struct ocfs2_orphan_scan_lvb *lvb; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + int status = 0; + + lockres = &osb->osb_orphan_scan.os_lockres; + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) + return status; + + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + if (lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION) + *seqno = be32_to_cpu(lvb->lvb_os_seqno); + return status; +} + +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex) +{ + struct ocfs2_lock_res *lockres; + struct ocfs2_orphan_scan_lvb *lvb; + int level = ex ? 
DLM_LOCK_EX : DLM_LOCK_PR; + + lockres = &osb->osb_orphan_scan.os_lockres; + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION; + lvb->lvb_os_seqno = cpu_to_be32(seqno); + ocfs2_cluster_unlock(osb, lockres, level); +} + int ocfs2_super_lock(struct ocfs2_super *osb, int ex) { @@ -2842,6 +2890,7 @@ local: ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb); + ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb); osb->cconn = conn; @@ -2878,6 +2927,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb, ocfs2_lock_res_free(&osb->osb_super_lockres); ocfs2_lock_res_free(&osb->osb_rename_lockres); ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres); + ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres); ocfs2_cluster_disconnect(osb->cconn, hangup_pending); osb->cconn = NULL; @@ -3061,6 +3111,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb) ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres); ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres); ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres); + ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres); } int ocfs2_drop_inode_locks(struct inode *inode) diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index e1fd5721cd7..31b90d7b8f5 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -62,6 +62,14 @@ struct ocfs2_qinfo_lvb { __be32 lvb_free_entry; }; +#define OCFS2_ORPHAN_LVB_VERSION 1 + +struct ocfs2_orphan_scan_lvb { + __u8 lvb_version; + __u8 lvb_reserved[3]; + __be32 lvb_os_seqno; +}; + /* ocfs2_inode_lock_full() 'arg_flags' flags */ /* don't wait on recovery. 
*/ #define OCFS2_META_LOCK_RECOVERY (0x01) @@ -113,6 +121,9 @@ int ocfs2_super_lock(struct ocfs2_super *osb, int ex); void ocfs2_super_unlock(struct ocfs2_super *osb, int ex); +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex); +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex); + int ocfs2_rename_lock(struct ocfs2_super *osb); void ocfs2_rename_unlock(struct ocfs2_super *osb); int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index c2a87c885b7..07267e0da90 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -187,6 +187,9 @@ static int ocfs2_sync_file(struct file *file, if (err) goto bail; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + goto bail; + journal = osb->journal->j_journal; err = jbd2_journal_force_commit(journal); @@ -894,9 +897,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) struct ocfs2_super *osb = OCFS2_SB(sb); struct buffer_head *bh = NULL; handle_t *handle = NULL; - int locked[MAXQUOTAS] = {0, 0}; - int credits, qtype; - struct ocfs2_mem_dqinfo *oinfo; + int qtype; + struct dquot *transfer_from[MAXQUOTAS] = { }; + struct dquot *transfer_to[MAXQUOTAS] = { }; mlog_entry("(0x%p, '%.*s')\n", dentry, dentry->d_name.len, dentry->d_name.name); @@ -969,30 +972,37 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { - credits = OCFS2_INODE_UPDATE_CREDITS; + /* + * Gather pointers to quota structures so that allocation / + * freeing of quota structures happens here and not inside + * vfs_dq_transfer() where we have problems with lock ordering + */ if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid && OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { - oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv; - status = ocfs2_lock_global_qf(oinfo, 1); - if (status < 0) + 
transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, + USRQUOTA); + transfer_from[USRQUOTA] = dqget(sb, inode->i_uid, + USRQUOTA); + if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) { + status = -ESRCH; goto bail_unlock; - credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) + - ocfs2_calc_qdel_credits(sb, USRQUOTA); - locked[USRQUOTA] = 1; + } } if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid && OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { - oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv; - status = ocfs2_lock_global_qf(oinfo, 1); - if (status < 0) + transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, + GRPQUOTA); + transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid, + GRPQUOTA); + if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) { + status = -ESRCH; goto bail_unlock; - credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) + - ocfs2_calc_qdel_credits(sb, GRPQUOTA); - locked[GRPQUOTA] = 1; + } } - handle = ocfs2_start_trans(osb, credits); + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + + 2 * ocfs2_quota_trans_credits(sb)); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -1030,12 +1040,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: - for (qtype = 0; qtype < MAXQUOTAS; qtype++) { - if (!locked[qtype]) - continue; - oinfo = sb_dqinfo(sb, qtype)->dqi_priv; - ocfs2_unlock_global_qf(oinfo, 1); - } ocfs2_inode_unlock(inode, 1); bail_unlock_rw: if (size_change) @@ -1043,6 +1047,12 @@ bail_unlock_rw: bail: brelse(bh); + /* Release quota pointers in case we acquired them */ + for (qtype = 0; qtype < MAXQUOTAS; qtype++) { + dqput(transfer_to[qtype]); + dqput(transfer_from[qtype]); + } + if (!status && attr->ia_valid & ATTR_MODE) { status = ocfs2_acl_chmod(inode); if (status < 0) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index a20a0f1e37f..4a3b9e6b31a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -28,6 +28,8 
@@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/kthread.h> +#include <linux/time.h> +#include <linux/random.h> #define MLOG_MASK_PREFIX ML_JOURNAL #include <cluster/masklog.h> @@ -52,6 +54,8 @@ DEFINE_SPINLOCK(trans_inc_lock); +#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000 + static int ocfs2_force_read_journal(struct inode *inode); static int ocfs2_recover_node(struct ocfs2_super *osb, int node_num, int slot_num); @@ -1841,6 +1845,113 @@ bail: return status; } +/* + * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some + * randomness to the timeout to minimize multiple nodes firing the timer at the + * same time. + */ +static inline unsigned long ocfs2_orphan_scan_timeout(void) +{ + unsigned long time; + + get_random_bytes(&time, sizeof(time)); + time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000); + return msecs_to_jiffies(time); +} + +/* + * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for + * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This + * is done to catch any orphans that are left over in orphan directories. + * + * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT + * milliseconds. It gets an EX lock on os_lockres and checks sequence number + * stored in LVB. If the sequence number has changed, it means some other + * node has done the scan. This node skips the scan and tracks the + * sequence number. If the sequence number didn't change, it means a scan + * hasn't happened. The node queues a scan and increments the + * sequence number in the LVB. 
+ */ +void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + int status, i; + u32 seqno = 0; + + os = &osb->osb_orphan_scan; + + status = ocfs2_orphan_scan_lock(osb, &seqno, DLM_LOCK_EX); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + goto out; + } + + if (os->os_seqno != seqno) { + os->os_seqno = seqno; + goto unlock; + } + + for (i = 0; i < osb->max_slots; i++) + ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL, + NULL); + /* + * We queued a recovery on orphan slots, increment the sequence + * number and update LVB so other node will skip the scan for a while + */ + seqno++; + os->os_count++; + os->os_scantime = CURRENT_TIME; +unlock: + ocfs2_orphan_scan_unlock(osb, seqno, DLM_LOCK_EX); +out: + return; +} + +/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT milliseconds */ +void ocfs2_orphan_scan_work(struct work_struct *work) |