diff options
Diffstat (limited to 'fs/btrfs/tree-log.c')
| -rw-r--r-- | fs/btrfs/tree-log.c | 887 |
1 files changed, 567 insertions, 320 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d81cda2e077..d91b0de7c50 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -35,6 +35,49 @@ #define LOG_INODE_EXISTS 1 /* + * directory trouble cases + * + * 1) on rename or unlink, if the inode being unlinked isn't in the fsync + * log, we must force a full commit before doing an fsync of the directory + * where the unlink was done. + * ---> record transid of last unlink/rename per directory + * + * mkdir foo/some_dir + * normal commit + * rename foo/some_dir foo2/some_dir + * mkdir foo/some_dir + * fsync foo/some_dir/some_file + * + * The fsync above will unlink the original some_dir without recording + * it in its new location (foo2). After a crash, some_dir will be gone + * unless the fsync of some_file forces a full commit + * + * 2) we must log any new names for any file or dir that is in the fsync + * log. ---> check inode while renaming/linking. + * + * 2a) we must log any new names for any file or dir during rename + * when the directory they are being removed from was logged. + * ---> check inode and old parent dir during rename + * + * 2a is actually the more important variant. With the extra logging + * a crash might unlink the old name without recreating the new one + * + * 3) after a crash, we must go through any directories with a link count + * of zero and redo the rm -rf + * + * mkdir f1/foo + * normal commit + * rm -rf f1/foo + * fsync(f1) + * + * The directory f1 was fully removed from the FS, but fsync was never + * called on f1, only its parent dir. After a crash the rm -rf must + * be replayed. This must be able to recurse down the entire + * directory tree. The inode link count fixup code takes care of the + * ugly details. + */ + +/* * stages for the tree walking. The first * stage (0) is to only pin down the blocks we find * the second stage (1) is to make sure that all the inodes @@ -47,12 +90,17 @@ #define LOG_WALK_REPLAY_INODES 1 #define LOG_WALK_REPLAY_ALL 2 -static int __btrfs_log_inode(struct btrfs_trans_handle *trans, +static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid, int del_all); /* * tree logging is a special write ahead log used to make sure that @@ -78,104 +126,6 @@ static int link_to_fixup_dir(struct btrfs_trans_handle *trans, */ /* - * btrfs_add_log_tree adds a new per-subvolume log tree into the - * tree of log tree roots. This must be called with a tree log transaction - * running (see start_log_trans). - */ -static int btrfs_add_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_key key; - struct btrfs_root_item root_item; - struct btrfs_inode_item *inode_item; - struct extent_buffer *leaf; - struct btrfs_root *new_root = root; - int ret; - u64 objectid = root->root_key.objectid; - - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - BTRFS_TREE_LOG_OBJECTID, - trans->transid, 0, 0, 0); - if (IS_ERR(leaf)) { - ret = PTR_ERR(leaf); - return ret; - } - - btrfs_set_header_nritems(leaf, 0); - btrfs_set_header_level(leaf, 0); - btrfs_set_header_bytenr(leaf, leaf->start); - btrfs_set_header_generation(leaf, trans->transid); - btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); - - write_extent_buffer(leaf, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(leaf), - BTRFS_FSID_SIZE); - btrfs_mark_buffer_dirty(leaf); - - inode_item = &root_item.inode; - memset(inode_item, 0, sizeof(*inode_item)); - inode_item->generation = cpu_to_le64(1); - inode_item->size = cpu_to_le64(3); - inode_item->nlink = cpu_to_le32(1); - inode_item->nbytes = cpu_to_le64(root->leafsize); - inode_item->mode = cpu_to_le32(S_IFDIR | 0755); - - btrfs_set_root_bytenr(&root_item, leaf->start); - btrfs_set_root_generation(&root_item, trans->transid); - btrfs_set_root_level(&root_item, 0); - btrfs_set_root_refs(&root_item, 0); - btrfs_set_root_used(&root_item, 0); - - memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); - root_item.drop_level = 0; - - btrfs_tree_unlock(leaf); - free_extent_buffer(leaf); - leaf = NULL; - - btrfs_set_root_dirid(&root_item, 0); - - key.objectid = BTRFS_TREE_LOG_OBJECTID; - key.offset = objectid; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key, - &root_item); - if (ret) - goto fail; - - new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree, - &key); - BUG_ON(!new_root); - - WARN_ON(root->log_root); - root->log_root = new_root; - - /* - * log trees do not get reference counted because they go away - * before a real commit is actually done. They do store pointers - * to file data extents, and those reference counts still get - * updated (along with back refs to the log tree). - */ - new_root->ref_cows = 0; - new_root->last_trans = trans->transid; - - /* - * we need to make sure the root block for this new tree - * is marked as dirty in the dirty_log_pages tree. This - * is how it gets flushed down to disk at tree log commit time. - * - * the tree logging mutex keeps others from coming in and changing - * the new_root->node, so we can safely access it here - */ - set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start, - new_root->node->start + new_root->node->len - 1, - GFP_NOFS); - -fail: - return ret; -} - -/* * start a sub transaction and setup the log tree * this increments the log tree writer count to make the people * syncing the tree wait for us to finish @@ -184,6 +134,14 @@ static int start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; + + mutex_lock(&root->log_mutex); + if (root->log_root) { + root->log_batch++; + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return 0; + } mutex_lock(&root->fs_info->tree_log_mutex); if (!root->fs_info->log_root_tree) { ret = btrfs_init_log_root_tree(trans, root->fs_info); @@ -193,9 +151,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans, ret = btrfs_add_log_tree(trans, root); BUG_ON(ret); } - atomic_inc(&root->fs_info->tree_log_writers); - root->fs_info->tree_log_batch++; mutex_unlock(&root->fs_info->tree_log_mutex); + root->log_batch++; + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); return 0; } @@ -212,13 +171,27 @@ static int join_running_log_trans(struct btrfs_root *root) if (!root->log_root) return -ENOENT; - mutex_lock(&root->fs_info->tree_log_mutex); + mutex_lock(&root->log_mutex); if (root->log_root) { ret = 0; - atomic_inc(&root->fs_info->tree_log_writers); - root->fs_info->tree_log_batch++; + atomic_inc(&root->log_writers); } - mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->log_mutex); + return ret; +} + +/* + * This either makes the current running log transaction wait + * until you call btrfs_end_log_trans() or it makes any future + * log transactions wait until you call btrfs_end_log_trans() + */ +int btrfs_pin_log_trans(struct btrfs_root *root) +{ + int ret = -ENOENT; + + mutex_lock(&root->log_mutex); + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); return ret; } @@ -226,12 +199,13 @@ static int join_running_log_trans(struct btrfs_root *root) * indicate we're done making changes to the log tree * and wake up anyone waiting to do a sync */ -static int end_log_trans(struct btrfs_root *root) +int btrfs_end_log_trans(struct btrfs_root *root) { - atomic_dec(&root->fs_info->tree_log_writers); - smp_mb(); - if (waitqueue_active(&root->fs_info->tree_log_wait)) - wake_up(&root->fs_info->tree_log_wait); + if (atomic_dec_and_test(&root->log_writers)) { + smp_mb(); + if (waitqueue_active(&root->log_writer_wait)) + wake_up(&root->log_writer_wait); + } return 0; } @@ -288,12 +262,9 @@ static int process_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen) { - if (wc->pin) { - mutex_lock(&log->fs_info->pinned_mutex); + if (wc->pin) btrfs_update_pinned_extents(log->fs_info->extent_root, eb->start, eb->len, 1); - mutex_unlock(&log->fs_info->pinned_mutex); - } if (btrfs_buffer_uptodate(eb, gen)) { if (wc->write) @@ -459,18 +430,16 @@ no_copy: static noinline struct inode *read_one_inode(struct btrfs_root *root, u64 objectid) { + struct btrfs_key key; struct inode *inode; - inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); - if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - BTRFS_I(inode)->location.objectid = objectid; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - } - if (is_bad_inode(inode)) { + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &key, root); + if (IS_ERR(inode)) { + inode = NULL; + } else if (is_bad_inode(inode)) { iput(inode); inode = NULL; } @@ -565,11 +534,12 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ ret = btrfs_drop_extents(trans, root, inode, - start, extent_end, start, &alloc_hint); + start, extent_end, extent_end, start, &alloc_hint); BUG_ON(ret); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 offset; unsigned long dest_offset; struct btrfs_key ins; @@ -584,6 +554,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); ins.type = BTRFS_EXTENT_ITEM_KEY; + offset = key->offset - btrfs_file_extent_offset(eb, item); if (ins.objectid > 0) { u64 csum_start; @@ -598,19 +569,16 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (ret == 0) { ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, - path->nodes[0]->start, - root->root_key.objectid, - trans->transid, key->objectid); + 0, root->root_key.objectid, + key->objectid, offset); } else { /* * insert the extent pointer in the extent * allocation tree */ - ret = btrfs_alloc_logged_extent(trans, root, - path->nodes[0]->start, - root->root_key.objectid, - trans->transid, key->objectid, - &ins); + ret = btrfs_alloc_logged_file_extent(trans, + root, root->root_key.objectid, + key->objectid, offset, &ins); BUG_ON(ret); } btrfs_release_path(root, path); @@ -692,6 +660,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, ret = link_to_fixup_dir(trans, root, path, location.objectid); BUG_ON(ret); + ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); BUG_ON(ret); kfree(name); @@ -828,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, return -ENOENT; inode = read_one_inode(root, key->objectid); - BUG_ON(!dir); + BUG_ON(!inode); ref_ptr = btrfs_item_ptr_offset(eb, slot); ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); @@ -893,6 +862,7 @@ conflict_again: victim_name_len)) { btrfs_inc_nlink(inode); btrfs_release_path(root, path); + ret = btrfs_unlink_inode(trans, root, dir, inode, victim_name, victim_name_len); @@ -1011,13 +981,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, key.offset--; btrfs_release_path(root, path); } - btrfs_free_path(path); + btrfs_release_path(root, path); if (nlink != inode->i_nlink) { inode->i_nlink = nlink; btrfs_update_inode(trans, root, inode); } BTRFS_I(inode)->index_cnt = (u64)-1; + if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) { + ret = replay_dir_deletes(trans, root, NULL, path, + inode->i_ino, 1); + BUG_ON(ret); + } + btrfs_free_path(path); + return 0; } @@ -1060,9 +1037,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, iput(inode); - if (key.offset == 0) - break; - key.offset--; + /* + * fixup on a directory may create new entries, + * make sure we always look for the highset possible + * offset + */ + key.offset = (u64)-1; } btrfs_release_path(root, path); return 0; @@ -1239,8 +1219,7 @@ insert: ret = insert_one_name(trans, root, path, key->objectid, key->offset, name, name_len, log_type, &log_key); - if (ret && ret != -ENOENT) - BUG(); + BUG_ON(ret && ret != -ENOENT); goto out; } @@ -1402,11 +1381,11 @@ again: read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); log_di = NULL; - if (dir_key->type == BTRFS_DIR_ITEM_KEY) { + if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { log_di = btrfs_lookup_dir_item(trans, log, log_path, dir_key->objectid, name, name_len, 0); - } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { + } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { log_di = btrfs_lookup_dir_index_item(trans, log, log_path, dir_key->objectid, @@ -1467,7 +1446,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, - u64 dirid) + u64 dirid, int del_all) { u64 range_start; u64 range_end; @@ -1497,10 +1476,14 @@ again: range_start = 0; range_end = 0; while (1) { - ret = find_dir_range(log, path, dirid, key_type, - &range_start, &range_end); - if (ret != 0) - break; + if (del_all) + range_end = (u64)-1; + else { + ret = find_dir_range(log, path, dirid, key_type, + &range_start, &range_end); + if (ret != 0) + break; + } dir_key.offset = range_start; while (1) { @@ -1526,7 +1509,8 @@ again: break; ret = check_item_in_log(trans, root, log, path, - log_path, dir, &found_key); + log_path, dir, + &found_key); BUG_ON(ret); if (found_key.offset == (u64)-1) break; @@ -1603,7 +1587,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { ret = replay_dir_deletes(wc->trans, - root, log, path, key.objectid); + root, log, path, key.objectid, 0); BUG_ON(ret); } ret = overwrite_item(wc->trans, root, path, @@ -1622,6 +1606,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); BUG_ON(ret); + + /* if the nlink count is zero here, the iput + * will free the inode. We bump it to make + * sure it doesn't get freed until the link + * count fixup is done + */ + if (inode->i_nlink == 0) { + btrfs_inc_nlink(inode); + btrfs_update_inode(wc->trans, + root, inode); + } iput(inode); } ret = link_to_fixup_dir(wc->trans, root, @@ -1704,12 +1699,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - ret = btrfs_drop_leaf_ref(trans, root, next); - BUG_ON(ret); - WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, @@ -1750,13 +1743,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[*level]; btrfs_tree_lock(next); clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - if (*level == 0) { - ret = btrfs_drop_leaf_ref(trans, root, next); - BUG_ON(ret); - } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, bytenr, blocksize); BUG_ON(ret); @@ -1807,15 +1797,10 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - if (*level == 0) { - ret = btrfs_drop_leaf_ref(trans, root, - next); - BUG_ON(ret); - } - WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, path->nodes[*level]->start, @@ -1879,14 +1864,10 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); clean_tree_block(trans, log, next); + btrfs_set_lock_blocking(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - if (orig_level == 0) { - ret = btrfs_drop_leaf_ref(trans, log, - next); - BUG_ON(ret); - } WARN_ON(log->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(log, next->start, @@ -1902,92 +1883,223 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, } } btrfs_free_path(path); - if (wc->free) - free_extent_buffer(log->node); return ret; } -static int wait_log_commit(struct btrfs_root *log) +/* + * helper function to update the item for a given subvolumes log root + * in the tree of log roots + */ +static int update_log_root(struct btrfs_trans_handle *trans, + struct btrfs_root *log) +{ + int ret; + + if (log->log_transid == 1) { + /* insert root item on the first sync */ + ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, + &log->root_key, &log->root_item); + } else { + ret = btrfs_update_root(trans, log->fs_info->log_root_tree, + &log->root_key, &log->root_item); + } + return ret; +} + +static int wait_log_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, unsigned long transid) { DEFINE_WAIT(wait); - u64 transid = log->fs_info->tree_log_transid; + int index = transid % 2; + /* + * we only allow two pending log transactions at a time, + * so we know that if ours is more than 2 older than the + * current transaction, we're done + */ do { - prepare_to_wait(&log->fs_info->tree_log_wait, &wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&log->fs_info->tree_log_mutex); - if (atomic_read(&log->fs_info->tree_log_commit)) + prepare_to_wait(&root->log_commit_wait[index], + &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&root->log_mutex); + + if (root->fs_info->last_trans_log_full_commit != + trans->transid && root->log_transid < transid + 2 && + atomic_read(&root->log_commit[index])) + schedule(); + + finish_wait(&root->log_commit_wait[index], &wait); + mutex_lock(&root->log_mutex); + } while (root->log_transid < transid + 2 && + atomic_read(&root->log_commit[index])); + return 0; +} + +static int wait_for_writer(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + DEFINE_WAIT(wait); + while (atomic_read(&root->log_writers)) { + prepare_to_wait(&root->log_writer_wait, + &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&root->log_mutex); + if (root->fs_info->last_trans_log_full_commit != + trans->transid && atomic_read(&root->log_writers)) schedule(); - finish_wait(&log->fs_info->tree_log_wait, &wait); - mutex_lock(&log->fs_info->tree_log_mutex); - } while (transid == log->fs_info->tree_log_transid && - atomic_read(&log->fs_info->tree_log_commit)); + mutex_lock(&root->log_mutex); + finish_wait(&root->log_writer_wait, &wait); + } return 0; } /* * btrfs_sync_log does sends a given tree log down to the disk and * updates the super blocks to record it. When this call is done, - * you know that any inodes previously logged are safely on disk + * you know that any inodes previously logged are safely on disk only + * if it returns 0. + * + * Any other return value means you need to call btrfs_commit_transaction. + * Some of the edge cases for fsyncing directories that have had unlinks + * or renames done in the past mean that sometimes the only safe + * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, + * that has happened. */ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + int index1; + int index2; int ret; - unsigned long batch; struct btrfs_root *log = root->log_root; + struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; - mutex_lock(&log->fs_info->tree_log_mutex); - if (atomic_read(&log->fs_info->tree_log_commit)) { - wait_log_commit(log); - goto out; + mutex_lock(&root->log_mutex); + index1 = root->log_transid % 2; + if (atomic_read(&root->log_commit[index1])) { + wait_log_commit(trans, root, root->log_transid); + mutex_unlock(&root->log_mutex); + return 0; } - atomic_set(&log->fs_info->tree_log_commit, 1); + atomic_set(&root->log_commit[index1], 1); + + /* wait for previous tree log sync to complete */ + if (atomic_read(&root->log_commit[(index1 + 1) % 2])) + wait_log_commit(trans, root, root->log_transid - 1); while (1) { - batch = log->fs_info->tree_log_batch; - mutex_unlock(&log->fs_info->tree_log_mutex); + unsigned long batch = root->log_batch; + mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); - mutex_lock(&log->fs_info->tree_log_mutex); - - while (atomic_read(&log->fs_info->tree_log_writers)) { - DEFINE_WAIT(wait); - prepare_to_wait(&log->fs_info->tree_log_wait, &wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&log->fs_info->tree_log_mutex); - if (atomic_read(&log->fs_info->tree_log_writers)) - schedule(); - mutex_lock(&log->fs_info->tree_log_mutex); - finish_wait(&log->fs_info->tree_log_wait, &wait); - } - if (batch == log->fs_info->tree_log_batch) + mutex_lock(&root->log_mutex); + + wait_for_writer(trans, root); + if (batch == root->log_batch) break; } + /* bail out if we need to do a full commit */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { + ret = -EAGAIN; + mutex_unlock(&root->log_mutex); + goto out; + } + ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); - ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, - &root->fs_info->log_root_tree->dirty_log_pages); + + btrfs_set_root_node(&log->root_item, log->node); + + root->log_batch = 0; + root->log_transid++; + log->log_transid = root->log_transid; + smp_mb(); + /* + * log tree has been flushed to disk, new modifications of + * the log will be written to new positions. so it's safe to + * allow log writers to go in. + */ + mutex_unlock(&root->log_mutex); + + mutex_lock(&log_root_tree->log_mutex); + log_root_tree->log_batch++; + atomic_inc(&log_root_tree->log_writers); + mutex_unlock(&log_root_tree->log_mutex); + + ret = update_log_root(trans, log); + BUG_ON(ret); + + mutex_lock(&log_root_tree->log_mutex); + if (atomic_dec_and_test(&log_root_tree->log_writers)) { + smp_mb(); + if (waitqueue_active(&log_root_tree->log_writer_wait)) + wake_up(&log_root_tree->log_writer_wait); + } + + index2 = log_root_tree->log_transid % 2; + if (atomic_read(&log_root_tree->log_commit[index2])) { + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid); + mutex_unlock(&log_root_tree->log_mutex); + goto out; + } + atomic_set(&log_root_tree->log_commit[index2], 1); + + if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid - 1); + } + + wait_for_writer(trans, log_root_tree); + + /* + * now that we've moved on to the tree of log tree roots, + * check the full commit flag again + */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out_wake_log_root; + } + + ret = btrfs_write_and_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages); BUG_ON(ret); btrfs_set_super_log_root(&root->fs_info->super_for_commit, - log->fs_info->log_root_tree->node->start); + log_root_tree->node->start); btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, - btrfs_header_level(log->fs_info->log_root_tree->node)); + btrfs_header_level(log_root_tree->node)); - write_ctree_super(trans, log->fs_info->tree_root, 2); - log->fs_info->tree_log_transid++; - log->fs_info->tree_log_batch = 0; - atomic_set(&log->fs_info->tree_log_commit, 0); + log_root_tree->log_batch = 0; + log_root_tree->log_transid++; smp_mb(); - if (waitqueue_active(&log->fs_info->tree_log_wait)) - wake_up(&log->fs_info->tree_log_wait); + + mutex_unlock(&log_root_tree->log_mutex); + + /* + * nobody else is going to jump in and write the the ctree + * super here because the log_commit atomic below is protecting + * us. We must be called with a transaction handle pinning + * the running transaction open, so a full commit can't hop + * in and cause problems either. + */ + write_ctree_super(trans, root->fs_info->tree_root, 2); + ret = 0; + +out_wake_log_root: + atomic_set(&log_root_tree->log_commit[index2], 0); + smp_mb(); + if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) + wake_up(&log_root_tree->log_commit_wait[index2]); out: - mutex_unlock(&log->fs_info->tree_log_mutex); + atomic_set(&root->log_commit[index1], 0); + smp_mb(); + if (waitqueue_active(&root->log_commit_wait[index1])) + wake_up(&root->log_commit_wait[index1]); return 0; } -/* * free all the extents used by the tree log. This should be called +/* + * free all the extents used by the tree log. This should be called * at commit time of the full transaction */ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -2019,38 +2131,18 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) start, end, GFP_NOFS); } - log = root->log_root; - ret = btrfs_del_root(trans, root->fs_info->log_root_tree, - &log->root_key); - BUG_ON(ret); + if (log->log_transid > 0) { + ret = btrfs_del_root(trans, root->fs_info->log_root_tree, + &log->root_key); + BUG_ON(ret); + } root->log_root = NULL; - kfree(root->log_root); + free_extent_buffer(log->node); + kfree(log); return 0; } /* - * helper function to update the item for a given subvolumes log root - * in the tree of log roots - */ -static int update_log_root(struct btrfs_trans_handle *trans, - struct btrfs_root *log) -{ - u64 bytenr = btrfs_root_bytenr(&log->root_item); - int ret; - - if (log->node->start == bytenr) - return 0; - - btrfs_set_root_bytenr(&log->root_item, log->node->start); - btrfs_set_root_generation(&log->root_item, trans->transid); - btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); - ret = btrfs_update_root(trans, log->fs_info->log_root_tree, - &log->root_key, &log->root_item); - BUG_ON(ret); - return ret; -} - -/* * If both a file and directory are logged, and unlinks or renames are * mixed in, we have a few interesting corners: * @@ -2141,7 +2233,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, btrfs_free_path(path); mutex_unlock(&BTRFS_I(dir)->log_mutex); - end_log_trans(root); + btrfs_end_log_trans(root); return 0; } @@ -2168,7 +2260,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, dirid, &index); mutex_unlock(&BTRFS_I(inode)->log_mutex); - end_log_trans(root); + btrfs_end_log_trans(root); return ret; } @@ -2466,7 +2558,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, ins_keys, ins_sizes, nr); BUG_ON(ret); - for (i = 0; i < nr; i++) { + for (i = 0; i < nr; i++, dst_path->slots[0]++) { dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_path->slots[0]); @@ -2502,36 +2594,31 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, found_type = btrfs_file_extent_type(src, extent); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 ds = btrfs_file_extent_disk_bytenr(src, - extent); - u64 dl = btrfs_file_extent_disk_num_bytes(src, - extent); - u64 cs = btrfs_file_extent_offset(src, extent); - u64 cl = btrfs_file_extent_num_bytes(src, - extent);; + u64 ds, dl, cs, cl; + ds = btrfs_file_extent_disk_bytenr(src, + extent); + /* ds == 0 is a hole */ + if (ds == 0) + continue; + + dl = btrfs_file_extent_disk_num_bytes(src, + extent); + cs = btrfs_file_extent_offset(src, extent); + cl = btrfs_file_extent_num_bytes(src, + extent);; if (btrfs_file_extent_compression(src, extent)) { cs = 0; cl = dl; } - /* ds == 0 is a hole */ - if (ds != 0) { - ret = btrfs_inc_extent_ref(trans, log, - ds, dl, - dst_path->nodes[0]->start, - BTRFS_TREE_LOG_OBJECTID, - trans->transid, - ins_keys[i].objectid); - BUG_ON(ret); - ret = btrfs_lookup_csums_range( - log->fs_info->csum_root, - ds + cs, ds + cs + cl - 1, - &ordered_sums); - BUG_ON(ret); - } + + ret = btrfs_lookup_csums_range( + log->fs_info->csum_root, + ds + cs, ds + cs + cl - 1, + &ordered_sums); + BUG_ON(ret); } } - dst_path->slots[0]++; } btrfs_mark_buffer_dirty(dst_path->nodes[0]); @@ -2568,7 +2655,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * * This handles both files and directories. */ -static int __btrfs_log_inode(struct btrfs_trans_handle *trans, +static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only) { @@ -2594,28 +2681,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans, min_key.offset = 0; max_key.objectid = inode->i_ino; + + /* today the code can only do partial logging of directories */ + if (!S_ISDIR(inode->i_mode)) + inode_only = LOG_INODE_ALL; + if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) max_key.type = BTRFS_XATTR_ITEM_KEY; else max_key.type = (u8)-1; max_key.offset = (u64)-1; - /* - * if this inode has already been logged and we're in inode_only - * mode, we don't want to delete the things that have already - * been written to the log. - * - * But, if the inode has been through an inode_only log, - * the logged_trans field is not set. This allows us to catch - * any new names for this inode in the backrefs by logging it - * again - */ - if (inode_only == LOG_INODE_EXISTS && - BTRFS_I(inode)->logged_trans == trans->transid) { - btrfs_free_path(path); - btrfs_free_path(dst_path); - goto out; - } mutex_lock(&BTRFS_I(inode)->log_mutex); /* @@ -2702,7 +2778,6 @@ next_slot: if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { btrfs_release_path(root, path); btrfs_release_path(log, dst_path); - BTRFS_I(inode)->log_dirty_trans = 0; ret = log_directory_changes(trans, root, inode, path, dst_path); BUG_ON(ret); } @@ -2711,24 +2786,69 @@ next_slot: btrfs_free_path(path); btrfs_free_path(dst_path); - - mutex_lock(&root->fs_info->tree_log_mutex); - ret = update_log_root(trans, log); - BUG_ON(ret); - mutex_unlock(&root->fs_info->tree_log_mutex); -out: return 0; } -int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - int inode_only) +/* + * follow the dentry parent pointers up the chain and see if any + * of the directories in it require a full commit before they can + * be logged. Returns zero if nothing special needs to be done or 1 if + * a full commit is required. + */ +static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, + struct inode *inode, + struct dentry *parent, + struct super_block *sb, + u64 last_committed) { - int ret; + int ret = 0; + struct btrfs_root *root; - start_log_trans(trans, root); - ret = __btrfs_log_inode(trans, root, inode, inode_only); - end_log_trans(root); + /* + * for regular files, if its inode is already on disk, we don't + * have to worry about the parents at all. This is because + * we can use the last_unlink_trans field to record renames + * and other fun in this file. + */ + if (S_ISREG(inode->i_mode) && + BTRFS_I(inode)->generation <= last_committed && + BTRFS_I(inode)->last_unlink_trans <= last_committed) + goto out; + + if (!S_ISDIR(inode->i_mode)) { + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + goto out; + inode = parent->d_inode; + } + + while (1) { + BTRFS_I(inode)->logged_trans = trans->transid; + smp_mb(); + + if (BTRFS_I(inode)->last_unlink_trans > last_committed) { + root = BTRFS_I(inode)->root; + + /* + * make sure any commits to the log are forced + * to be full commits + */ + root->fs_info->last_trans_log_full_commit = + trans->transid; + ret = 1; + break; + } + + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + break; + + if (parent == sb->s_root) + break; + + parent = parent->d_parent; + inode = parent->d_inode; + + } +out: return ret; } @@ -2738,31 +2858,70 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans, * only logging is done of any parent directories that are older than * the last committed transaction */ -int btrfs_log_dentry(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry) +int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct dentry *parent, int exists_only) { - int inode_only = LOG_INODE_ALL; + int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; - int ret; + int ret = 0; + u64 last_committed = root->fs_info->last_trans_committed; + + sb = inode->i_sb; + + if (btrfs_test_opt(root, NOTREELOG)) { + ret = 1; + goto end_no_trans; + } + + if (root->fs_info->last_trans_log_full_commit > + root->fs_info->last_trans_committed) { + ret = 1; + goto end_no_trans; + } + + ret = check_parent_dirs_for_sync(trans, inode, parent, + sb, last_committed); + if (ret) + goto end_no_trans; start_log_trans(trans, root); - sb = dentry->d_inode->i_sb; - while (1) { - ret = __btrfs_log_inode(trans, root, dentry->d_inode, - inode_only); - BUG_ON(ret); - inode_only = LOG_INODE_EXISTS; - dentry = dentry->d_parent; - if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) + ret = btrfs_log_inode(trans, root, inode, inode_only); + BUG_ON(ret); + + /* + * for regular files, if its inode is already on disk, we don't + * have to worry about the parents at all. This is because + * we can use the last_unlink_trans field to record renames + * and other fun in this file. + */ + if (S_ISREG(inode->i_mode) && + BTRFS_I(inode)->generation <= last_committed && + BTRFS_I(inode)->last_unlink_trans <= last_committed) + goto no_parent; + + inode_only = LOG_INODE_EXISTS; + while (1) { + if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) break; - if (BTRFS_I(dentry->d_inode)->generation <= - root->fs_info->last_trans_committed) + inode = parent->d_inode; + if (BTRFS_I(inode)->generation > + root->fs_info->last_trans_committed) { + ret = btrfs_log_inode(trans, root, inode, inode_only); + BUG_ON(ret); + } + if (parent == sb->s_root) break; + + parent = parent->d_parent; } - end_log_trans(root); - return 0; +no_parent: + ret = 0; + btrfs_end_log_trans(root); +end_no_trans: + return ret; } /* @@ -2774,12 +2933,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans, int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry) { - u64 gen; - gen = root->fs_info->last_trans_new_blockgroup; - if (gen > root->fs_info->last_trans_committed) - return 1; - else - return btrfs_log_dentry(trans, root, dentry); + return btrfs_log_inode_parent(trans, root, dentry->d_inode, + dentry->d_parent, 0); } /* @@ -2846,7 +3001,7 @@ again: BUG_ON(!wc.replay_dest); wc.replay_dest->log_root = log; - btrfs_record_root_in_trans(wc.replay_dest); + btrfs_record_root_in_trans(trans, wc.replay_dest); ret = walk_log_tree(trans, log, &wc); BUG_ON(ret); @@ -2864,6 +3019,7 @@ again: key.offset = found_key.offset - 1; wc.replay_dest->log_root = NULL; free_extent_buffer(log->node); + free_extent_buffer(log->commit_root); kfree(log); if (found_key.offset == 0) @@ -2896,3 +3052,94 @@ again: kfree(log_root_tree); return 0; } + +/* + * there are some corner cases where we want to force a full + * commit instead of allowing a directory to be logged. + * + * They revolve around files there were unlinked from the directory, and + * this function updates the parent directory so that a full commit is + * properly done if it is fsync'd later after the unlinks are done. + */ +void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + struct inode *dir, struct inode *inode, + int for_rename) +{ + /* + * when we're logging a file, if it hasn't been renamed + * or unlinked, and its inode is fully committed on disk, + * we don't have to worry about walking up the directory chain + * to log its parents. + * + * So, we use the last_unlink_trans field to put this transid + * into the file. When the file is logged we check it and + * don't log the parents if the file is fully on disk. + */ + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->last_unlink_trans = trans->transid; + + /* + * if this directory was already logged any new + * names for this file/dir will get recorded + */ + smp_mb(); + if (BTRFS_I(dir)->logged_trans == trans->transid) + return; + + /* + * if the inode we're about to unlink was logged, + * the log will be properly updated for any new names + */ + if (BTRFS_I(inode)->logged_trans == trans->transid) + return; + + /* + * when renaming files across directories, if the directory + * there we're unlinking from gets fsync'd later on, there's + * no way to find the destination directory later and fsync it + * properly. So, we have to be conservative and force commits + * so the new name gets discovered. + */ + if (for_rename) + goto record; + + /* we can safely do the unlink without any special recording */ + return; + +record: + BTRFS_I(dir)->last_unlink_trans = trans->transid; +} + +/* + * Call this after adding a new name for a file and it will properly + * update the log to reflect the new name. + * + * It will return zero if all goes well, and it will return 1 if a + * full transaction commit is required. + */ +int btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *old_dir, + struct dentry *parent) +{ + struct btrfs_root * root = BTRFS_I(inode)->root; + + /* + * this will force the logging code to walk the dentry chain + * up for the file + */ + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->last_unlink_trans = trans->transid; + + /* + * if this inode hasn't been logged and directory we're renaming it + * from hasn't been logged, we don't need to log it + */ + if (BTRFS_I(inode)->logged_trans <= + root->fs_info->last_trans_committed && + (!old_dir || BTRFS_I(old_dir)->logged_trans <= + root->fs_info->last_trans_committed)) + return 0; + + return btrfs_log_inode_parent(trans, root, inode, parent, 1); +} + |
