diff options
Diffstat (limited to 'fs/btrfs/send.c')
| -rw-r--r-- | fs/btrfs/send.c | 1908 | 
1 files changed, 1400 insertions, 508 deletions
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e46e0ed7492..6528aa66218 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -24,12 +24,12 @@  #include <linux/xattr.h>  #include <linux/posix_acl_xattr.h>  #include <linux/radix-tree.h> -#include <linux/crc32c.h>  #include <linux/vmalloc.h>  #include <linux/string.h>  #include "send.h"  #include "backref.h" +#include "hash.h"  #include "locking.h"  #include "disk-io.h"  #include "btrfs_inode.h" @@ -51,15 +51,18 @@ struct fs_path {  		struct {  			char *start;  			char *end; -			char *prepared;  			char *buf; -			int buf_len; -			unsigned int reversed:1; -			unsigned int virtual_mem:1; +			unsigned short buf_len:15; +			unsigned short reversed:1;  			char inline_buf[];  		}; -		char pad[PAGE_SIZE]; +		/* +		 * Average path length does not exceed 200 bytes, we'll have +		 * better packing in the slab and higher chance to satisfy +		 * a allocation later during send. +		 */ +		char pad[256];  	};  };  #define FS_PATH_INLINE_SIZE \ @@ -88,8 +91,6 @@ struct send_ctx {  	u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];  	u64 flags;	/* 'flags' member of btrfs_ioctl_send_args is u64 */ -	struct vfsmount *mnt; -  	struct btrfs_root *send_root;  	struct btrfs_root *parent_root;  	struct clone_root *clone_roots; @@ -111,6 +112,8 @@ struct send_ctx {  	int cur_inode_deleted;  	u64 cur_inode_size;  	u64 cur_inode_mode; +	u64 cur_inode_rdev; +	u64 cur_inode_last_extent;  	u64 send_progress; @@ -121,8 +124,130 @@ struct send_ctx {  	struct list_head name_cache_list;  	int name_cache_size; -	struct file *cur_inode_filp; +	struct file_ra_state ra; +  	char *read_buf; + +	/* +	 * We process inodes by their increasing order, so if before an +	 * incremental send we reverse the parent/child relationship of +	 * directories such that a directory with a lower inode number was +	 * the parent of a directory with a higher inode number, and the one +	 * becoming the new parent got renamed too, we can't rename/move the +	 * directory with 
lower inode number when we finish processing it - we +	 * must process the directory with higher inode number first, then +	 * rename/move it and then rename/move the directory with lower inode +	 * number. Example follows. +	 * +	 * Tree state when the first send was performed: +	 * +	 * . +	 * |-- a                   (ino 257) +	 *     |-- b               (ino 258) +	 *         | +	 *         | +	 *         |-- c           (ino 259) +	 *         |   |-- d       (ino 260) +	 *         | +	 *         |-- c2          (ino 261) +	 * +	 * Tree state when the second (incremental) send is performed: +	 * +	 * . +	 * |-- a                   (ino 257) +	 *     |-- b               (ino 258) +	 *         |-- c2          (ino 261) +	 *             |-- d2      (ino 260) +	 *                 |-- cc  (ino 259) +	 * +	 * The sequence of steps that lead to the second state was: +	 * +	 * mv /a/b/c/d /a/b/c2/d2 +	 * mv /a/b/c /a/b/c2/d2/cc +	 * +	 * "c" has lower inode number, but we can't move it (2nd mv operation) +	 * before we move "d", which has higher inode number. +	 * +	 * So we just memorize which move/rename operations must be performed +	 * later when their respective parent is processed and moved/renamed. +	 */ + +	/* Indexed by parent directory inode number. */ +	struct rb_root pending_dir_moves; + +	/* +	 * Reverse index, indexed by the inode number of a directory that +	 * is waiting for the move/rename of its immediate parent before its +	 * own move/rename can be performed. +	 */ +	struct rb_root waiting_dir_moves; + +	/* +	 * A directory that is going to be rm'ed might have a child directory +	 * which is in the pending directory moves index above. In this case, +	 * the directory can only be removed after the move/rename of its child +	 * is performed. Example: +	 * +	 * Parent snapshot: +	 * +	 * .                        
(ino 256) +	 * |-- a/                   (ino 257) +	 *     |-- b/               (ino 258) +	 *         |-- c/           (ino 259) +	 *         |   |-- x/       (ino 260) +	 *         | +	 *         |-- y/           (ino 261) +	 * +	 * Send snapshot: +	 * +	 * .                        (ino 256) +	 * |-- a/                   (ino 257) +	 *     |-- b/               (ino 258) +	 *         |-- YY/          (ino 261) +	 *              |-- x/      (ino 260) +	 * +	 * Sequence of steps that lead to the send snapshot: +	 * rm -f /a/b/c/foo.txt +	 * mv /a/b/y /a/b/YY +	 * mv /a/b/c/x /a/b/YY +	 * rmdir /a/b/c +	 * +	 * When the child is processed, its move/rename is delayed until its +	 * parent is processed (as explained above), but all other operations +	 * like update utimes, chown, chgrp, etc, are performed and the paths +	 * that it uses for those operations must use the orphanized name of +	 * its parent (the directory we're going to rm later), so we need to +	 * memorize that name. +	 * +	 * Indexed by the inode number of the directory to be deleted. +	 */ +	struct rb_root orphan_dirs; +}; + +struct pending_dir_move { +	struct rb_node node; +	struct list_head list; +	u64 parent_ino; +	u64 ino; +	u64 gen; +	struct list_head update_refs; +}; + +struct waiting_dir_move { +	struct rb_node node; +	u64 ino; +	/* +	 * There might be some directory that could not be removed because it +	 * was waiting for this directory inode to be moved first. Therefore +	 * after this directory is moved, we can try to rmdir the ino rmdir_ino. 
+	 */ +	u64 rmdir_ino; +}; + +struct orphan_dir_info { +	struct rb_node node; +	u64 ino; +	u64 gen;  };  struct name_cache_entry { @@ -146,6 +271,20 @@ struct name_cache_entry {  	char name[];  }; +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); + +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino); + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino); + +static int need_send_hole(struct send_ctx *sctx) +{ +	return (sctx->parent_root && !sctx->cur_inode_new && +		!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted && +		S_ISREG(sctx->cur_inode_mode)); +} +  static void fs_path_reset(struct fs_path *p)  {  	if (p->reversed) { @@ -167,7 +306,6 @@ static struct fs_path *fs_path_alloc(void)  	if (!p)  		return NULL;  	p->reversed = 0; -	p->virtual_mem = 0;  	p->buf = p->inline_buf;  	p->buf_len = FS_PATH_INLINE_SIZE;  	fs_path_reset(p); @@ -190,12 +328,8 @@ static void fs_path_free(struct fs_path *p)  {  	if (!p)  		return; -	if (p->buf != p->inline_buf) { -		if (p->virtual_mem) -			vfree(p->buf); -		else -			kfree(p->buf); -	} +	if (p->buf != p->inline_buf) +		kfree(p->buf);  	kfree(p);  } @@ -215,42 +349,33 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)  	if (p->buf_len >= len)  		return 0; +	if (len > PATH_MAX) { +		WARN_ON(1); +		return -ENOMEM; +	} +  	path_len = p->end - p->start;  	old_buf_len = p->buf_len; -	len = PAGE_ALIGN(len); +	/* +	 * First time the inline_buf does not suffice +	 */  	if (p->buf == p->inline_buf) { -		tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN); -		if (!tmp_buf) { -			tmp_buf = vmalloc(len); -			if (!tmp_buf) -				return -ENOMEM; -			p->virtual_mem = 1; -		} -		memcpy(tmp_buf, p->buf, p->buf_len); -		p->buf = tmp_buf; -		p->buf_len = len; +		tmp_buf = kmalloc(len, GFP_NOFS); +		if (tmp_buf) +			memcpy(tmp_buf, p->buf, old_buf_len);  	} else { -		if (p->virtual_mem) { -			tmp_buf = vmalloc(len); -			if (!tmp_buf) -				return -ENOMEM; -			memcpy(tmp_buf, 
p->buf, p->buf_len); -			vfree(p->buf); -		} else { -			tmp_buf = krealloc(p->buf, len, GFP_NOFS); -			if (!tmp_buf) { -				tmp_buf = vmalloc(len); -				if (!tmp_buf) -					return -ENOMEM; -				memcpy(tmp_buf, p->buf, p->buf_len); -				kfree(p->buf); -				p->virtual_mem = 1; -			} -		} -		p->buf = tmp_buf; -		p->buf_len = len; +		tmp_buf = krealloc(p->buf, len, GFP_NOFS);  	} +	if (!tmp_buf) +		return -ENOMEM; +	p->buf = tmp_buf; +	/* +	 * The real size of the buffer is bigger, this will let the fast path +	 * happen most of the time +	 */ +	p->buf_len = ksize(p->buf); +  	if (p->reversed) {  		tmp_buf = p->buf + old_buf_len - path_len - 1;  		p->end = p->buf + p->buf_len - 1; @@ -263,7 +388,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)  	return 0;  } -static int fs_path_prepare_for_add(struct fs_path *p, int name_len) +static int fs_path_prepare_for_add(struct fs_path *p, int name_len, +				   char **prepared)  {  	int ret;  	int new_len; @@ -279,11 +405,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len)  		if (p->start != p->end)  			*--p->start = '/';  		p->start -= name_len; -		p->prepared = p->start; +		*prepared = p->start;  	} else {  		if (p->start != p->end)  			*p->end++ = '/'; -		p->prepared = p->end; +		*prepared = p->end;  		p->end += name_len;  		*p->end = 0;  	} @@ -295,12 +421,12 @@ out:  static int fs_path_add(struct fs_path *p, const char *name, int name_len)  {  	int ret; +	char *prepared; -	ret = fs_path_prepare_for_add(p, name_len); +	ret = fs_path_prepare_for_add(p, name_len, &prepared);  	if (ret < 0)  		goto out; -	memcpy(p->prepared, name, name_len); -	p->prepared = NULL; +	memcpy(prepared, name, name_len);  out:  	return ret; @@ -309,12 +435,12 @@ out:  static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)  {  	int ret; +	char *prepared; -	ret = fs_path_prepare_for_add(p, p2->end - p2->start); +	ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);  	if (ret < 0)  		goto out; 
-	memcpy(p->prepared, p2->start, p2->end - p2->start); -	p->prepared = NULL; +	memcpy(prepared, p2->start, p2->end - p2->start);  out:  	return ret; @@ -325,28 +451,18 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,  					  unsigned long off, int len)  {  	int ret; +	char *prepared; -	ret = fs_path_prepare_for_add(p, len); +	ret = fs_path_prepare_for_add(p, len, &prepared);  	if (ret < 0)  		goto out; -	read_extent_buffer(eb, p->prepared, off, len); -	p->prepared = NULL; +	read_extent_buffer(eb, prepared, off, len);  out:  	return ret;  } -#if 0 -static void fs_path_remove(struct fs_path *p) -{ -	BUG_ON(p->reversed); -	while (p->start != p->end && *p->end != '/') -		p->end--; -	*p->end = 0; -} -#endif -  static int fs_path_copy(struct fs_path *p, struct fs_path *from)  {  	int ret; @@ -385,6 +501,7 @@ static struct btrfs_path *alloc_path_for_send(void)  		return NULL;  	path->search_commit_root = 1;  	path->skip_locking = 1; +	path->need_commit_sem = 1;  	return path;  } @@ -437,30 +554,15 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)  	return 0;  } -#if 0 -static int tlv_put_u8(struct send_ctx *sctx, u16 attr, u8 value) -{ -	return tlv_put(sctx, attr, &value, sizeof(value)); -} - -static int tlv_put_u16(struct send_ctx *sctx, u16 attr, u16 value) -{ -	__le16 tmp = cpu_to_le16(value); -	return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} - -static int tlv_put_u32(struct send_ctx *sctx, u16 attr, u32 value) -{ -	__le32 tmp = cpu_to_le32(value); -	return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} -#endif +#define TLV_PUT_DEFINE_INT(bits) \ +	static int tlv_put_u##bits(struct send_ctx *sctx,	 	\ +			u##bits attr, u##bits value)			\ +	{								\ +		__le##bits __tmp = cpu_to_le##bits(value);		\ +		return tlv_put(sctx, attr, &__tmp, sizeof(__tmp));	\ +	} -static int tlv_put_u64(struct send_ctx *sctx, u16 attr, u64 value) -{ -	__le64 tmp = cpu_to_le64(value); -	return tlv_put(sctx, attr, &tmp, sizeof(tmp)); -} 
+TLV_PUT_DEFINE_INT(64)  static int tlv_put_string(struct send_ctx *sctx, u16 attr,  			  const char *str, int len) @@ -476,17 +578,6 @@ static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,  	return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);  } -#if 0 -static int tlv_put_timespec(struct send_ctx *sctx, u16 attr, -			    struct timespec *ts) -{ -	struct btrfs_timespec bts; -	bts.sec = cpu_to_le64(ts->tv_sec); -	bts.nsec = cpu_to_le32(ts->tv_nsec); -	return tlv_put(sctx, attr, &bts, sizeof(bts)); -} -#endif -  static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,  				  struct extent_buffer *eb,  				  struct btrfs_timespec *ts) @@ -534,12 +625,6 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,  		if (ret < 0) \  			goto tlv_put_failure; \  	} while (0) -#define TLV_PUT_TIMESPEC(sctx, attrtype, ts) \ -	do { \ -		ret = tlv_put_timespec(sctx, attrtype, ts); \ -		if (ret < 0) \ -			goto tlv_put_failure; \ -	} while (0)  #define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \  	do { \  		ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \ @@ -565,10 +650,8 @@ static int begin_cmd(struct send_ctx *sctx, int cmd)  {  	struct btrfs_cmd_header *hdr; -	if (!sctx->send_buf) { -		WARN_ON(1); +	if (WARN_ON(!sctx->send_buf))  		return -EINVAL; -	}  	BUG_ON(sctx->send_size); @@ -589,7 +672,7 @@ static int send_cmd(struct send_ctx *sctx)  	hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));  	hdr->crc = 0; -	crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); +	crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);  	hdr->crc = cpu_to_le32(crc);  	ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, @@ -697,29 +780,22 @@ out:  /*   * Helper function to retrieve some fields from an inode item.   
*/ -static int get_inode_info(struct btrfs_root *root, -			  u64 ino, u64 *size, u64 *gen, -			  u64 *mode, u64 *uid, u64 *gid, -			  u64 *rdev) +static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, +			  u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, +			  u64 *gid, u64 *rdev)  {  	int ret;  	struct btrfs_inode_item *ii;  	struct btrfs_key key; -	struct btrfs_path *path; - -	path = alloc_path_for_send(); -	if (!path) -		return -ENOMEM;  	key.objectid = ino;  	key.type = BTRFS_INODE_ITEM_KEY;  	key.offset = 0;  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); -	if (ret < 0) -		goto out;  	if (ret) { -		ret = -ENOENT; -		goto out; +		if (ret > 0) +			ret = -ENOENT; +		return ret;  	}  	ii = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -737,7 +813,22 @@ static int get_inode_info(struct btrfs_root *root,  	if (rdev)  		*rdev = btrfs_inode_rdev(path->nodes[0], ii); -out: +	return ret; +} + +static int get_inode_info(struct btrfs_root *root, +			  u64 ino, u64 *size, u64 *gen, +			  u64 *mode, u64 *uid, u64 *gid, +			  u64 *rdev) +{ +	struct btrfs_path *path; +	int ret; + +	path = alloc_path_for_send(); +	if (!path) +		return -ENOMEM; +	ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid, +			       rdev);  	btrfs_free_path(path);  	return ret;  } @@ -791,7 +882,7 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,  	if (found_key->type == BTRFS_INODE_REF_KEY) {  		ptr = (unsigned long)btrfs_item_ptr(eb, slot,  						    struct btrfs_inode_ref); -		item = btrfs_item_nr(eb, slot); +		item = btrfs_item_nr(slot);  		total = btrfs_item_size(eb, item);  		elem_size = sizeof(*iref);  	} else { @@ -884,9 +975,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,  	struct btrfs_dir_item *di;  	struct btrfs_key di_key;  	char *buf = NULL; -	char *buf2 = NULL;  	int buf_len; -	int buf_virtual = 0;  	u32 name_len;  	u32 data_len;  	u32 cur; @@ -896,7 +985,11 @@ static 
int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,  	int num;  	u8 type; -	buf_len = PAGE_SIZE; +	if (found_key->type == BTRFS_XATTR_ITEM_KEY) +		buf_len = BTRFS_MAX_XATTR_SIZE(root); +	else +		buf_len = PATH_MAX; +  	buf = kmalloc(buf_len, GFP_NOFS);  	if (!buf) {  		ret = -ENOMEM; @@ -905,7 +998,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,  	eb = path->nodes[0];  	slot = path->slots[0]; -	item = btrfs_item_nr(eb, slot); +	item = btrfs_item_nr(slot);  	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);  	cur = 0;  	len = 0; @@ -918,30 +1011,23 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,  		type = btrfs_dir_type(eb, di);  		btrfs_dir_item_key_to_cpu(eb, di, &di_key); -		if (name_len + data_len > buf_len) { -			buf_len = PAGE_ALIGN(name_len + data_len); -			if (buf_virtual) { -				buf2 = vmalloc(buf_len); -				if (!buf2) { -					ret = -ENOMEM; -					goto out; -				} -				vfree(buf); -			} else { -				buf2 = krealloc(buf, buf_len, GFP_NOFS); -				if (!buf2) { -					buf2 = vmalloc(buf_len); -					if (!buf2) { -						ret = -ENOMEM; -						goto out; -					} -					kfree(buf); -					buf_virtual = 1; -				} +		if (type == BTRFS_FT_XATTR) { +			if (name_len > XATTR_NAME_MAX) { +				ret = -ENAMETOOLONG; +				goto out; +			} +			if (name_len + data_len > buf_len) { +				ret = -E2BIG; +				goto out; +			} +		} else { +			/* +			 * Path too long +			 */ +			if (name_len + data_len > buf_len) { +				ret = -ENAMETOOLONG; +				goto out;  			} - -			buf = buf2; -			buf2 = NULL;  		}  		read_extent_buffer(eb, buf, (unsigned long)(di + 1), @@ -964,10 +1050,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,  	}  out: -	if (buf_virtual) -		vfree(buf); -	else -		kfree(buf); +	kfree(buf);  	return ret;  } @@ -1035,6 +1118,7 @@ out:  struct backref_ctx {  	struct send_ctx *sctx; +	struct btrfs_path *path;  	/* number of total found references */  	u64 found; @@ 
-1105,8 +1189,9 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)  	 * There are inodes that have extents that lie behind its i_size. Don't  	 * accept clones from these extents.  	 */ -	ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL, -			NULL); +	ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL, +			       NULL, NULL, NULL); +	btrfs_release_path(bctx->path);  	if (ret < 0)  		return ret; @@ -1185,12 +1270,17 @@ static int find_extent_clone(struct send_ctx *sctx,  	if (!tmp_path)  		return -ENOMEM; +	/* We only use this path under the commit sem */ +	tmp_path->need_commit_sem = 0; +  	backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);  	if (!backref_ctx) {  		ret = -ENOMEM;  		goto out;  	} +	backref_ctx->path = tmp_path; +  	if (data_offset >= ino_size) {  		/*  		 * There may be extents that lie behind the file's size. @@ -1218,8 +1308,10 @@ static int find_extent_clone(struct send_ctx *sctx,  	}  	logical = disk_byte + btrfs_file_extent_offset(eb, fi); +	down_read(&sctx->send_root->fs_info->commit_root_sem);  	ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,  				  &found_key, &flags); +	up_read(&sctx->send_root->fs_info->commit_root_sem);  	btrfs_release_path(tmp_path);  	if (ret < 0) @@ -1261,8 +1353,6 @@ static int find_extent_clone(struct send_ctx *sctx,  		extent_item_pos = logical - found_key.objectid;  	else  		extent_item_pos = 0; - -	extent_item_pos = logical - found_key.objectid;  	ret = iterate_extent_inodes(sctx->send_root->fs_info,  					found_key.objectid, extent_item_pos, 1,  					__iterate_backrefs, backref_ctx); @@ -1273,9 +1363,9 @@ static int find_extent_clone(struct send_ctx *sctx,  	if (!backref_ctx->found_itself) {  		/* found a bug in backref code? */  		ret = -EIO; -		printk(KERN_ERR "btrfs: ERROR did not find backref in " +		btrfs_err(sctx->send_root->fs_info, "did not find backref in "  				"send_root. 
inode=%llu, offset=%llu, " -				"disk_byte=%llu found extent=%llu\n", +				"disk_byte=%llu found extent=%llu",  				ino, data_offset, disk_byte, found_key.objectid);  		goto out;  	} @@ -1301,6 +1391,16 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "  	}  	if (cur_clone_root) { +		if (compressed != BTRFS_COMPRESS_NONE) { +			/* +			 * Offsets given by iterate_extent_inodes() are relative +			 * to the start of the extent, we need to add logical +			 * offset from the file extent item. +			 * (See why at backref.c:check_extent_in_eb()) +			 */ +			cur_clone_root->offset += btrfs_file_extent_offset(eb, +									   fi); +		}  		*found = cur_clone_root;  		ret = 0;  	} else { @@ -1346,7 +1446,7 @@ static int read_symlink(struct btrfs_root *root,  	BUG_ON(compression);  	off = btrfs_file_extent_inline_start(ei); -	len = btrfs_file_extent_inline_len(path->nodes[0], ei); +	len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei);  	ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); @@ -1375,13 +1475,9 @@ static int gen_unique_name(struct send_ctx *sctx,  		return -ENOMEM;  	while (1) { -		len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu", +		len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",  				ino, gen, idx); -		if (len >= sizeof(tmp)) { -			/* should really not happen */ -			ret = -EOVERFLOW; -			goto out; -		} +		ASSERT(len < sizeof(tmp));  		di = btrfs_lookup_dir_item(NULL, sctx->send_root,  				path, BTRFS_FIRST_FREE_OBJECTID, @@ -1548,6 +1644,10 @@ static int lookup_dir_item_inode(struct btrfs_root *root,  		goto out;  	}  	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); +	if (key.type == BTRFS_ROOT_ITEM_KEY) { +		ret = -ENOENT; +		goto out; +	}  	*found_inode = key.objectid;  	*found_type = btrfs_dir_type(path->nodes[0], di); @@ -1591,7 +1691,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,  		goto out;  	} -	if (key.type == BTRFS_INODE_REF_KEY) { +	if (found_key.type == 
BTRFS_INODE_REF_KEY) {  		struct btrfs_inode_ref *iref;  		iref = btrfs_item_ptr(path->nodes[0], path->slots[0],  				      struct btrfs_inode_ref); @@ -1613,10 +1713,12 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,  		goto out;  	btrfs_release_path(path); -	ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, NULL, -			NULL, NULL); -	if (ret < 0) -		goto out; +	if (dir_gen) { +		ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, +				     NULL, NULL, NULL); +		if (ret < 0) +			goto out; +	}  	*dir = parent_dir; @@ -1632,13 +1734,12 @@ static int is_first_ref(struct btrfs_root *root,  	int ret;  	struct fs_path *tmp_name;  	u64 tmp_dir; -	u64 tmp_dir_gen;  	tmp_name = fs_path_alloc();  	if (!tmp_name)  		return -ENOMEM; -	ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); +	ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);  	if (ret < 0)  		goto out; @@ -1857,13 +1958,20 @@ static void name_cache_delete(struct send_ctx *sctx,  	nce_head = radix_tree_lookup(&sctx->name_cache,  			(unsigned long)nce->ino); -	BUG_ON(!nce_head); +	if (!nce_head) { +		btrfs_err(sctx->send_root->fs_info, +	      "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", +			nce->ino, sctx->name_cache_size); +	}  	list_del(&nce->radix_list);  	list_del(&nce->list);  	sctx->name_cache_size--; -	if (list_empty(nce_head)) { +	/* +	 * We may not get to the final release of nce_head if the lookup fails +	 */ +	if (nce_head && list_empty(nce_head)) {  		radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);  		kfree(nce_head);  	} @@ -1942,7 +2050,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,  {  	int ret;  	int nce_ret; -	struct btrfs_path *path = NULL;  	struct name_cache_entry *nce = NULL;  	/* @@ -1968,10 +2075,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,  		}  	} -	path = alloc_path_for_send(); -	if (!path) -		return -ENOMEM; -  	/*  	 * If the inode is not existent yet, add 
the orphan name and return 1.  	 * This should only happen for the parent dir that we determine in @@ -2047,7 +2150,6 @@ out_cache:  	name_cache_clean_unused(sctx);  out: -	btrfs_free_path(path);  	return ret;  } @@ -2097,12 +2199,27 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,  	while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {  		fs_path_reset(name); -		ret = __get_cur_name_and_parent(sctx, ino, gen, -				&parent_inode, &parent_gen, name); +		if (is_waiting_for_rm(sctx, ino)) { +			ret = gen_unique_name(sctx, ino, gen, name); +			if (ret < 0) +				goto out; +			ret = fs_path_add_path(dest, name); +			break; +		} + +		if (is_waiting_for_move(sctx, ino)) { +			ret = get_first_ref(sctx->parent_root, ino, +					    &parent_inode, &parent_gen, name); +		} else { +			ret = __get_cur_name_and_parent(sctx, ino, gen, +							&parent_inode, +							&parent_gen, name); +			if (ret) +				stop = 1; +		} +  		if (ret < 0)  			goto out; -		if (ret) -			stop = 1;  		ret = fs_path_add_path(dest, name);  		if (ret < 0) @@ -2120,77 +2237,6 @@ out:  }  /* - * Called for regular files when sending extents data. Opens a struct file - * to read from the file. 
- */ -static int open_cur_inode_file(struct send_ctx *sctx) -{ -	int ret = 0; -	struct btrfs_key key; -	struct path path; -	struct inode *inode; -	struct dentry *dentry; -	struct file *filp; -	int new = 0; - -	if (sctx->cur_inode_filp) -		goto out; - -	key.objectid = sctx->cur_ino; -	key.type = BTRFS_INODE_ITEM_KEY; -	key.offset = 0; - -	inode = btrfs_iget(sctx->send_root->fs_info->sb, &key, sctx->send_root, -			&new); -	if (IS_ERR(inode)) { -		ret = PTR_ERR(inode); -		goto out; -	} - -	dentry = d_obtain_alias(inode); -	inode = NULL; -	if (IS_ERR(dentry)) { -		ret = PTR_ERR(dentry); -		goto out; -	} - -	path.mnt = sctx->mnt; -	path.dentry = dentry; -	filp = dentry_open(&path, O_RDONLY | O_LARGEFILE, current_cred()); -	dput(dentry); -	dentry = NULL; -	if (IS_ERR(filp)) { -		ret = PTR_ERR(filp); -		goto out; -	} -	sctx->cur_inode_filp = filp; - -out: -	/* -	 * no xxxput required here as every vfs op -	 * does it by itself on failure -	 */ -	return ret; -} - -/* - * Closes the struct file that was created in open_cur_inode_file - */ -static int close_cur_inode_file(struct send_ctx *sctx) -{ -	int ret = 0; - -	if (!sctx->cur_inode_filp) -		goto out; - -	ret = filp_close(sctx->cur_inode_filp, NULL); -	sctx->cur_inode_filp = NULL; - -out: -	return ret; -} - -/*   * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace   */  static int send_subvol_begin(struct send_ctx *sctx) @@ -2205,7 +2251,7 @@ static int send_subvol_begin(struct send_ctx *sctx)  	char *name = NULL;  	int namelen; -	path = alloc_path_for_send(); +	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; @@ -2254,12 +2300,12 @@ static int send_subvol_begin(struct send_ctx *sctx)  	TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,  			sctx->send_root->root_item.uuid);  	TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, -			sctx->send_root->root_item.ctransid); +		    le64_to_cpu(sctx->send_root->root_item.ctransid));  	if (parent_root) {  		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,  				
sctx->parent_root->root_item.uuid);  		TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, -				sctx->parent_root->root_item.ctransid); +			    le64_to_cpu(sctx->parent_root->root_item.ctransid));  	}  	ret = send_cmd(sctx); @@ -2437,10 +2483,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);  	if (!p)  		return -ENOMEM; -	ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL, -			NULL, &rdev); -	if (ret < 0) -		goto out; +	if (ino != sctx->cur_ino) { +		ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, +				     NULL, NULL, &rdev); +		if (ret < 0) +			goto out; +	} else { +		gen = sctx->cur_inode_gen; +		mode = sctx->cur_inode_mode; +		rdev = sctx->cur_inode_rdev; +	}  	if (S_ISREG(mode)) {  		cmd = BTRFS_SEND_C_MKFILE; @@ -2520,17 +2572,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)  	key.objectid = dir;  	key.type = BTRFS_DIR_INDEX_KEY;  	key.offset = 0; +	ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); +	if (ret < 0) +		goto out; +  	while (1) { -		ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, -				1, 0); -		if (ret < 0) -			goto out; -		if (!ret) { -			eb = path->nodes[0]; -			slot = path->slots[0]; -			btrfs_item_key_to_cpu(eb, &found_key, slot); +		eb = path->nodes[0]; +		slot = path->slots[0]; +		if (slot >= btrfs_header_nritems(eb)) { +			ret = btrfs_next_leaf(sctx->send_root, path); +			if (ret < 0) { +				goto out; +			} else if (ret > 0) { +				ret = 0; +				break; +			} +			continue;  		} -		if (ret || found_key.objectid != key.objectid || + +		btrfs_item_key_to_cpu(eb, &found_key, slot); +		if (found_key.objectid != key.objectid ||  		    found_key.type != key.type) {  			ret = 0;  			goto out; @@ -2545,8 +2606,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)  			goto out;  		} -		key.offset = found_key.offset + 1; -		btrfs_release_path(path); +		path->slots[0]++;  	}  out: @@ -2598,7 +2658,7 @@ struct recorded_ref {   * everything mixed. 
So we first record all refs and later process them.   * This function is a helper to record one ref.   */ -static int record_ref(struct list_head *head, u64 dir, +static int __record_ref(struct list_head *head, u64 dir,  		      u64 dir_gen, struct fs_path *path)  {  	struct recorded_ref *ref; @@ -2684,12 +2744,78 @@ out:  	return ret;  } +static struct orphan_dir_info * +add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +{ +	struct rb_node **p = &sctx->orphan_dirs.rb_node; +	struct rb_node *parent = NULL; +	struct orphan_dir_info *entry, *odi; + +	odi = kmalloc(sizeof(*odi), GFP_NOFS); +	if (!odi) +		return ERR_PTR(-ENOMEM); +	odi->ino = dir_ino; +	odi->gen = 0; + +	while (*p) { +		parent = *p; +		entry = rb_entry(parent, struct orphan_dir_info, node); +		if (dir_ino < entry->ino) { +			p = &(*p)->rb_left; +		} else if (dir_ino > entry->ino) { +			p = &(*p)->rb_right; +		} else { +			kfree(odi); +			return entry; +		} +	} + +	rb_link_node(&odi->node, parent, p); +	rb_insert_color(&odi->node, &sctx->orphan_dirs); +	return odi; +} + +static struct orphan_dir_info * +get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +{ +	struct rb_node *n = sctx->orphan_dirs.rb_node; +	struct orphan_dir_info *entry; + +	while (n) { +		entry = rb_entry(n, struct orphan_dir_info, node); +		if (dir_ino < entry->ino) +			n = n->rb_left; +		else if (dir_ino > entry->ino) +			n = n->rb_right; +		else +			return entry; +	} +	return NULL; +} + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino) +{ +	struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino); + +	return odi != NULL; +} + +static void free_orphan_dir_info(struct send_ctx *sctx, +				 struct orphan_dir_info *odi) +{ +	if (!odi) +		return; +	rb_erase(&odi->node, &sctx->orphan_dirs); +	kfree(odi); +} +  /*   * Returns 1 if a directory can be removed at this point in time.   * We check this by iterating all dir items and checking if the inode behind   * the dir item was already processed.   
*/ -static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, +		     u64 send_progress)  {  	int ret = 0;  	struct btrfs_root *root = sctx->parent_root; @@ -2712,31 +2838,52 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)  	key.objectid = dir;  	key.type = BTRFS_DIR_INDEX_KEY;  	key.offset = 0; +	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +	if (ret < 0) +		goto out;  	while (1) { -		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); -		if (ret < 0) -			goto out; -		if (!ret) { -			btrfs_item_key_to_cpu(path->nodes[0], &found_key, -					path->slots[0]); +		struct waiting_dir_move *dm; + +		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { +			ret = btrfs_next_leaf(root, path); +			if (ret < 0) +				goto out; +			else if (ret > 0) +				break; +			continue;  		} -		if (ret || found_key.objectid != key.objectid || -		    found_key.type != key.type) { +		btrfs_item_key_to_cpu(path->nodes[0], &found_key, +				      path->slots[0]); +		if (found_key.objectid != key.objectid || +		    found_key.type != key.type)  			break; -		}  		di = btrfs_item_ptr(path->nodes[0], path->slots[0],  				struct btrfs_dir_item);  		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); +		dm = get_waiting_dir_move(sctx, loc.objectid); +		if (dm) { +			struct orphan_dir_info *odi; + +			odi = add_orphan_dir_info(sctx, dir); +			if (IS_ERR(odi)) { +				ret = PTR_ERR(odi); +				goto out; +			} +			odi->gen = dir_gen; +			dm->rmdir_ino = dir; +			ret = 0; +			goto out; +		} +  		if (loc.objectid > send_progress) {  			ret = 0;  			goto out;  		} -		btrfs_release_path(path); -		key.offset = found_key.offset + 1; +		path->slots[0]++;  	}  	ret = 1; @@ -2746,10 +2893,452 @@ out:  	return ret;  } +static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) +{ +	struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino); + +	return entry != NULL; +} + 
+static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) +{ +	struct rb_node **p = &sctx->waiting_dir_moves.rb_node; +	struct rb_node *parent = NULL; +	struct waiting_dir_move *entry, *dm; + +	dm = kmalloc(sizeof(*dm), GFP_NOFS); +	if (!dm) +		return -ENOMEM; +	dm->ino = ino; +	dm->rmdir_ino = 0; + +	while (*p) { +		parent = *p; +		entry = rb_entry(parent, struct waiting_dir_move, node); +		if (ino < entry->ino) { +			p = &(*p)->rb_left; +		} else if (ino > entry->ino) { +			p = &(*p)->rb_right; +		} else { +			kfree(dm); +			return -EEXIST; +		} +	} + +	rb_link_node(&dm->node, parent, p); +	rb_insert_color(&dm->node, &sctx->waiting_dir_moves); +	return 0; +} + +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino) +{ +	struct rb_node *n = sctx->waiting_dir_moves.rb_node; +	struct waiting_dir_move *entry; + +	while (n) { +		entry = rb_entry(n, struct waiting_dir_move, node); +		if (ino < entry->ino) +			n = n->rb_left; +		else if (ino > entry->ino) +			n = n->rb_right; +		else +			return entry; +	} +	return NULL; +} + +static void free_waiting_dir_move(struct send_ctx *sctx, +				  struct waiting_dir_move *dm) +{ +	if (!dm) +		return; +	rb_erase(&dm->node, &sctx->waiting_dir_moves); +	kfree(dm); +} + +static int add_pending_dir_move(struct send_ctx *sctx, +				u64 ino, +				u64 ino_gen, +				u64 parent_ino, +				struct list_head *new_refs, +				struct list_head *deleted_refs) +{ +	struct rb_node **p = &sctx->pending_dir_moves.rb_node; +	struct rb_node *parent = NULL; +	struct pending_dir_move *entry = NULL, *pm; +	struct recorded_ref *cur; +	int exists = 0; +	int ret; + +	pm = kmalloc(sizeof(*pm), GFP_NOFS); +	if (!pm) +		return -ENOMEM; +	pm->parent_ino = parent_ino; +	pm->ino = ino; +	pm->gen = ino_gen; +	INIT_LIST_HEAD(&pm->list); +	INIT_LIST_HEAD(&pm->update_refs); +	RB_CLEAR_NODE(&pm->node); + +	while (*p) { +		parent = *p; +		entry = rb_entry(parent, struct pending_dir_move, node); +		if (parent_ino < 
entry->parent_ino) { +			p = &(*p)->rb_left; +		} else if (parent_ino > entry->parent_ino) { +			p = &(*p)->rb_right; +		} else { +			exists = 1; +			break; +		} +	} + +	list_for_each_entry(cur, deleted_refs, list) { +		ret = dup_ref(cur, &pm->update_refs); +		if (ret < 0) +			goto out; +	} +	list_for_each_entry(cur, new_refs, list) { +		ret = dup_ref(cur, &pm->update_refs); +		if (ret < 0) +			goto out; +	} + +	ret = add_waiting_dir_move(sctx, pm->ino); +	if (ret) +		goto out; + +	if (exists) { +		list_add_tail(&pm->list, &entry->list); +	} else { +		rb_link_node(&pm->node, parent, p); +		rb_insert_color(&pm->node, &sctx->pending_dir_moves); +	} +	ret = 0; +out: +	if (ret) { +		__free_recorded_refs(&pm->update_refs); +		kfree(pm); +	} +	return ret; +} + +static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, +						      u64 parent_ino) +{ +	struct rb_node *n = sctx->pending_dir_moves.rb_node; +	struct pending_dir_move *entry; + +	while (n) { +		entry = rb_entry(n, struct pending_dir_move, node); +		if (parent_ino < entry->parent_ino) +			n = n->rb_left; +		else if (parent_ino > entry->parent_ino) +			n = n->rb_right; +		else +			return entry; +	} +	return NULL; +} + +static int path_loop(struct send_ctx *sctx, struct fs_path *name, +		     u64 ino, u64 gen, u64 *ancestor_ino) +{ +	int ret = 0; +	u64 parent_inode = 0; +	u64 parent_gen = 0; +	u64 start_ino = ino; + +	*ancestor_ino = 0; +	while (ino != BTRFS_FIRST_FREE_OBJECTID) { +		fs_path_reset(name); + +		if (is_waiting_for_rm(sctx, ino)) +			break; +		if (is_waiting_for_move(sctx, ino)) { +			if (*ancestor_ino == 0) +				*ancestor_ino = ino; +			ret = get_first_ref(sctx->parent_root, ino, +					    &parent_inode, &parent_gen, name); +		} else { +			ret = __get_cur_name_and_parent(sctx, ino, gen, +							&parent_inode, +							&parent_gen, name); +			if (ret > 0) { +				ret = 0; +				break; +			} +		} +		if (ret < 0) +			break; +		if (parent_inode == start_ino) { +			ret = 1; +			if 
(*ancestor_ino == 0) +				*ancestor_ino = ino; +			break; +		} +		ino = parent_inode; +		gen = parent_gen; +	} +	return ret; +} + +static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) +{ +	struct fs_path *from_path = NULL; +	struct fs_path *to_path = NULL; +	struct fs_path *name = NULL; +	u64 orig_progress = sctx->send_progress; +	struct recorded_ref *cur; +	u64 parent_ino, parent_gen; +	struct waiting_dir_move *dm = NULL; +	u64 rmdir_ino = 0; +	int ret; +	u64 ancestor = 0; + +	name = fs_path_alloc(); +	from_path = fs_path_alloc(); +	if (!name || !from_path) { +		ret = -ENOMEM; +		goto out; +	} + +	dm = get_waiting_dir_move(sctx, pm->ino); +	ASSERT(dm); +	rmdir_ino = dm->rmdir_ino; +	free_waiting_dir_move(sctx, dm); + +	ret = get_first_ref(sctx->parent_root, pm->ino, +			    &parent_ino, &parent_gen, name); +	if (ret < 0) +		goto out; + +	ret = get_cur_path(sctx, parent_ino, parent_gen, +			   from_path); +	if (ret < 0) +		goto out; +	ret = fs_path_add_path(from_path, name); +	if (ret < 0) +		goto out; + +	sctx->send_progress = sctx->cur_ino + 1; +	ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); +	if (ret) { +		LIST_HEAD(deleted_refs); +		ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); +		ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, +					   &pm->update_refs, &deleted_refs); +		if (ret < 0) +			goto out; +		if (rmdir_ino) { +			dm = get_waiting_dir_move(sctx, pm->ino); +			ASSERT(dm); +			dm->rmdir_ino = rmdir_ino; +		} +		goto out; +	} +	fs_path_reset(name); +	to_path = name; +	name = NULL; +	ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); +	if (ret < 0) +		goto out; + +	ret = send_rename(sctx, from_path, to_path); +	if (ret < 0) +		goto out; + +	if (rmdir_ino) { +		struct orphan_dir_info *odi; + +		odi = get_orphan_dir_info(sctx, rmdir_ino); +		if (!odi) { +			/* already deleted */ +			goto finish; +		} +		ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1); +		if (ret < 0) +			goto out; +		if (!ret) 
+			goto finish; + +		name = fs_path_alloc(); +		if (!name) { +			ret = -ENOMEM; +			goto out; +		} +		ret = get_cur_path(sctx, rmdir_ino, odi->gen, name); +		if (ret < 0) +			goto out; +		ret = send_rmdir(sctx, name); +		if (ret < 0) +			goto out; +		free_orphan_dir_info(sctx, odi); +	} + +finish: +	ret = send_utimes(sctx, pm->ino, pm->gen); +	if (ret < 0) +		goto out; + +	/* +	 * After rename/move, need to update the utimes of both new parent(s) +	 * and old parent(s). +	 */ +	list_for_each_entry(cur, &pm->update_refs, list) { +		if (cur->dir == rmdir_ino) +			continue; +		ret = send_utimes(sctx, cur->dir, cur->dir_gen); +		if (ret < 0) +			goto out; +	} + +out: +	fs_path_free(name); +	fs_path_free(from_path); +	fs_path_free(to_path); +	sctx->send_progress = orig_progress; + +	return ret; +} + +static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m) +{ +	if (!list_empty(&m->list)) +		list_del(&m->list); +	if (!RB_EMPTY_NODE(&m->node)) +		rb_erase(&m->node, &sctx->pending_dir_moves); +	__free_recorded_refs(&m->update_refs); +	kfree(m); +} + +static void tail_append_pending_moves(struct pending_dir_move *moves, +				      struct list_head *stack) +{ +	if (list_empty(&moves->list)) { +		list_add_tail(&moves->list, stack); +	} else { +		LIST_HEAD(list); +		list_splice_init(&moves->list, &list); +		list_add_tail(&moves->list, stack); +		list_splice_tail(&list, stack); +	} +} + +static int apply_children_dir_moves(struct send_ctx *sctx) +{ +	struct pending_dir_move *pm; +	struct list_head stack; +	u64 parent_ino = sctx->cur_ino; +	int ret = 0; + +	pm = get_pending_dir_moves(sctx, parent_ino); +	if (!pm) +		return 0; + +	INIT_LIST_HEAD(&stack); +	tail_append_pending_moves(pm, &stack); + +	while (!list_empty(&stack)) { +		pm = list_first_entry(&stack, struct pending_dir_move, list); +		parent_ino = pm->ino; +		ret = apply_dir_move(sctx, pm); +		free_pending_move(sctx, pm); +		if (ret) +			goto out; +		pm = get_pending_dir_moves(sctx, parent_ino); 
+		if (pm) +			tail_append_pending_moves(pm, &stack); +	} +	return 0; + +out: +	while (!list_empty(&stack)) { +		pm = list_first_entry(&stack, struct pending_dir_move, list); +		free_pending_move(sctx, pm); +	} +	return ret; +} + +static int wait_for_parent_move(struct send_ctx *sctx, +				struct recorded_ref *parent_ref) +{ +	int ret = 0; +	u64 ino = parent_ref->dir; +	u64 parent_ino_before, parent_ino_after; +	struct fs_path *path_before = NULL; +	struct fs_path *path_after = NULL; +	int len1, len2; + +	path_after = fs_path_alloc(); +	path_before = fs_path_alloc(); +	if (!path_after || !path_before) { +		ret = -ENOMEM; +		goto out; +	} + +	/* +	 * Our current directory inode may not yet be renamed/moved because some +	 * ancestor (immediate or not) has to be renamed/moved first. So find if +	 * such ancestor exists and make sure our own rename/move happens after +	 * that ancestor is processed. +	 */ +	while (ino > BTRFS_FIRST_FREE_OBJECTID) { +		if (is_waiting_for_move(sctx, ino)) { +			ret = 1; +			break; +		} + +		fs_path_reset(path_before); +		fs_path_reset(path_after); + +		ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, +				    NULL, path_after); +		if (ret < 0) +			goto out; +		ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, +				    NULL, path_before); +		if (ret < 0 && ret != -ENOENT) { +			goto out; +		} else if (ret == -ENOENT) { +			ret = 1; +			break; +		} + +		len1 = fs_path_len(path_before); +		len2 = fs_path_len(path_after); +		if (ino > sctx->cur_ino && +		    (parent_ino_before != parent_ino_after || len1 != len2 || +		     memcmp(path_before->start, path_after->start, len1))) { +			ret = 1; +			break; +		} +		ino = parent_ino_after; +	} + +out: +	fs_path_free(path_before); +	fs_path_free(path_after); + +	if (ret == 1) { +		ret = add_pending_dir_move(sctx, +					   sctx->cur_ino, +					   sctx->cur_inode_gen, +					   ino, +					   &sctx->new_refs, +					   &sctx->deleted_refs); +		if (!ret) +			ret = 1; +	} + +	
return ret; +} +  /*   * This does all the move/link/unlink/rmdir magic.   */ -static int process_recorded_refs(struct send_ctx *sctx) +static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)  {  	int ret = 0;  	struct recorded_ref *cur; @@ -2760,6 +3349,7 @@ static int process_recorded_refs(struct send_ctx *sctx)  	u64 ow_gen;  	int did_overwrite = 0;  	int is_orphan = 0; +	u64 last_dir_ino_rm = 0;  verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); @@ -2898,11 +3488,18 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);  				 * dirs, we always have one new and one deleted  				 * ref. The deleted ref is ignored later.  				 */ -				ret = send_rename(sctx, valid_path, -						cur->full_path); +				ret = wait_for_parent_move(sctx, cur);  				if (ret < 0)  					goto out; -				ret = fs_path_copy(valid_path, cur->full_path); +				if (ret) { +					*pending_move = 1; +				} else { +					ret = send_rename(sctx, valid_path, +							  cur->full_path); +					if (!ret) +						ret = fs_path_copy(valid_path, +							       cur->full_path); +				}  				if (ret < 0)  					goto out;  			} else { @@ -2924,7 +3521,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);  		 * later, we do this check again and rmdir it then if possible.  		 * See the use of check_dirs for more details.  		 
*/ -		ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino); +		ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, +				sctx->cur_ino);  		if (ret < 0)  			goto out;  		if (ret) { @@ -3015,8 +3613,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);  			ret = send_utimes(sctx, cur->dir, cur->dir_gen);  			if (ret < 0)  				goto out; -		} else if (ret == inode_state_did_delete) { -			ret = can_rmdir(sctx, cur->dir, sctx->cur_ino); +		} else if (ret == inode_state_did_delete && +			   cur->dir != last_dir_ino_rm) { +			ret = can_rmdir(sctx, cur->dir, cur->dir_gen, +					sctx->cur_ino);  			if (ret < 0)  				goto out;  			if (ret) { @@ -3027,6 +3627,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);  				ret = send_rmdir(sctx, valid_path);  				if (ret < 0)  					goto out; +				last_dir_ino_rm = cur->dir;  			}  		}  	} @@ -3040,9 +3641,8 @@ out:  	return ret;  } -static int __record_new_ref(int num, u64 dir, int index, -			    struct fs_path *name, -			    void *ctx) +static int record_ref(struct btrfs_root *root, int num, u64 dir, int index, +		      struct fs_path *name, void *ctx, struct list_head *refs)  {  	int ret = 0;  	struct send_ctx *sctx = ctx; @@ -3053,7 +3653,7 @@ static int __record_new_ref(int num, u64 dir, int index,  	if (!p)  		return -ENOMEM; -	ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, +	ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,  			NULL, NULL);  	if (ret < 0)  		goto out; @@ -3065,7 +3665,7 @@ static int __record_new_ref(int num, u64 dir, int index,  	if (ret < 0)  		goto out; -	ret = record_ref(&sctx->new_refs, dir, gen, p); +	ret = __record_ref(refs, dir, gen, p);  out:  	if (ret) @@ -3073,37 +3673,23 @@ out:  	return ret;  } +static int __record_new_ref(int num, u64 dir, int index, +			    struct fs_path *name, +			    void *ctx) +{ +	struct send_ctx *sctx = ctx; +	return record_ref(sctx->send_root, num, dir, index, name, +			  ctx, &sctx->new_refs); +} + 
+  static int __record_deleted_ref(int num, u64 dir, int index,  				struct fs_path *name,  				void *ctx)  { -	int ret = 0;  	struct send_ctx *sctx = ctx; -	struct fs_path *p; -	u64 gen; - -	p = fs_path_alloc(); -	if (!p) -		return -ENOMEM; - -	ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL, -			NULL, NULL); -	if (ret < 0) -		goto out; - -	ret = get_cur_path(sctx, dir, gen, p); -	if (ret < 0) -		goto out; -	ret = fs_path_add_path(p, name); -	if (ret < 0) -		goto out; - -	ret = record_ref(&sctx->deleted_refs, dir, gen, p); - -out: -	if (ret) -		fs_path_free(p); -	return ret; +	return record_ref(sctx->parent_root, num, dir, index, name, +			  ctx, &sctx->deleted_refs);  }  static int record_new_ref(struct send_ctx *sctx) @@ -3271,6 +3857,7 @@ static int process_all_refs(struct send_ctx *sctx,  	struct extent_buffer *eb;  	int slot;  	iterate_inode_ref_t cb; +	int pending_move = 0;  	path = alloc_path_for_send();  	if (!path) @@ -3283,21 +3870,31 @@ static int process_all_refs(struct send_ctx *sctx,  		root = sctx->parent_root;  		cb = __record_deleted_ref;  	} else { -		BUG(); +		btrfs_err(sctx->send_root->fs_info, +				"Wrong command %d in process_all_refs", cmd); +		ret = -EINVAL; +		goto out;  	}  	key.objectid = sctx->cmp_key->objectid;  	key.type = BTRFS_INODE_REF_KEY;  	key.offset = 0; -	while (1) { -		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); -		if (ret < 0) -			goto out; -		if (ret) -			break; +	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +	if (ret < 0) +		goto out; +	while (1) {  		eb = path->nodes[0];  		slot = path->slots[0]; +		if (slot >= btrfs_header_nritems(eb)) { +			ret = btrfs_next_leaf(root, path); +			if (ret < 0) +				goto out; +			else if (ret > 0) +				break; +			continue; +		} +  		btrfs_item_key_to_cpu(eb, &found_key, slot);  		if (found_key.objectid != key.objectid || @@ -3306,15 +3903,16 @@ static int process_all_refs(struct send_ctx *sctx,  			break;  		ret = iterate_inode_ref(root, path, 
&found_key, 0, cb, sctx); -		btrfs_release_path(path);  		if (ret < 0)  			goto out; -		key.offset = found_key.offset + 1; +		path->slots[0]++;  	}  	btrfs_release_path(path); -	ret = process_recorded_refs(sctx); +	ret = process_recorded_refs(sctx, &pending_move); +	/* Only applicable to an incremental send. */ +	ASSERT(pending_move == 0);  out:  	btrfs_free_path(path); @@ -3589,19 +4187,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx)  	key.objectid = sctx->cmp_key->objectid;  	key.type = BTRFS_XATTR_ITEM_KEY;  	key.offset = 0; -	while (1) { -		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); -		if (ret < 0) -			goto out; -		if (ret) { -			ret = 0; -			goto out; -		} +	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +	if (ret < 0) +		goto out; +	while (1) {  		eb = path->nodes[0];  		slot = path->slots[0]; -		btrfs_item_key_to_cpu(eb, &found_key, slot); +		if (slot >= btrfs_header_nritems(eb)) { +			ret = btrfs_next_leaf(root, path); +			if (ret < 0) { +				goto out; +			} else if (ret > 0) { +				ret = 0; +				break; +			} +			continue; +		} +		btrfs_item_key_to_cpu(eb, &found_key, slot);  		if (found_key.objectid != key.objectid ||  		    found_key.type != key.type) {  			ret = 0; @@ -3613,8 +4217,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)  		if (ret < 0)  			goto out; -		btrfs_release_path(path); -		key.offset = found_key.offset + 1; +		path->slots[0]++;  	}  out: @@ -3622,6 +4225,86 @@ out:  	return ret;  } +/* + * Copy up to @len bytes of the current inode's data, starting at file + * offset @offset, into sctx->read_buf. The length is trimmed so that the + * read never goes past i_size. Returns the number of bytes copied (0 when + * nothing is left to read) or a negative errno on failure. + */ +static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) +{ +	struct btrfs_root *root = sctx->send_root; +	struct btrfs_fs_info *fs_info = root->fs_info; +	struct inode *inode; +	struct page *page; +	char *addr; +	struct btrfs_key key; +	pgoff_t index = offset >> PAGE_CACHE_SHIFT; +	pgoff_t last_index; +	unsigned pg_offset = offset & ~PAGE_CACHE_MASK; +	ssize_t ret = 0; + +	key.objectid = sctx->cur_ino; +	key.type = BTRFS_INODE_ITEM_KEY; +	key.offset = 0; + +	inode = btrfs_iget(fs_info->sb, 
&key, root, NULL); +	if (IS_ERR(inode)) +		return PTR_ERR(inode); + +	/* Do not read past EOF: trim len to what is left after offset. */ +	if (offset + len > i_size_read(inode)) { +		if (offset > i_size_read(inode)) +			len = 0; +		else +			len = i_size_read(inode) - offset; +	} +	if (len == 0) +		goto out; + +	last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; + +	/* initial readahead */ +	memset(&sctx->ra, 0, sizeof(struct file_ra_state)); +	file_ra_state_init(&sctx->ra, inode->i_mapping); +	btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index, +		       last_index - index + 1); + +	while (index <= last_index) { +		unsigned cur_len = min_t(unsigned, len, +					 PAGE_CACHE_SIZE - pg_offset); +		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); +		if (!page) { +			ret = -ENOMEM; +			break; +		} + +		if (!PageUptodate(page)) { +			btrfs_readpage(NULL, page); +			lock_page(page); +			if (!PageUptodate(page)) { +				unlock_page(page); +				page_cache_release(page); +				ret = -EIO; +				break; +			} +		} + +		addr = kmap(page); +		memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len); +		kunmap(page); +		unlock_page(page); +		page_cache_release(page); +		index++; +		pg_offset = 0; +		len -= cur_len; +		ret += cur_len; +	} +out: +	iput(inode); +	return ret; +} +  /*   * Read some bytes from the current inode/file and send a write command to   * user space. @@ -3630,35 +4306,20 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)  {  	int ret = 0;  	struct fs_path *p; -	loff_t pos = offset; -	int num_read = 0; -	mm_segment_t old_fs; +	ssize_t num_read = 0;  	p = fs_path_alloc();  	if (!p)  		return -ENOMEM; -	/* -	 * vfs normally only accepts user space buffers for security reasons. -	 * we only read from the file and also only provide the read_buf buffer -	 * to vfs. As this buffer does not come from a user space call, it's -	 * ok to temporary allow kernel space buffers. 
-	 */ -	old_fs = get_fs(); -	set_fs(KERNEL_DS); -  verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len); -	ret = open_cur_inode_file(sctx); -	if (ret < 0) -		goto out; - -	ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos); -	if (ret < 0) -		goto out; -	num_read = ret; -	if (!num_read) +	num_read = fill_read_buf(sctx, offset, len); +	if (num_read <= 0) { +		if (num_read < 0) +			ret = num_read;  		goto out; +	}  	ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);  	if (ret < 0) @@ -3677,7 +4338,6 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);  tlv_put_failure:  out:  	fs_path_free(p); -	set_fs(old_fs);  	if (ret < 0)  		return ret;  	return num_read; @@ -3730,7 +4390,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "  	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,  			clone_root->root->root_item.uuid);  	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, -			clone_root->root->root_item.ctransid); +		    le64_to_cpu(clone_root->root->root_item.ctransid));  	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);  	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,  			clone_root->offset); @@ -3776,6 +4436,39 @@ out:  	return ret;  } +static int send_hole(struct send_ctx *sctx, u64 end) +{ +	struct fs_path *p = NULL; +	u64 offset = sctx->cur_inode_last_extent; +	u64 len; +	int ret = 0; + +	p = fs_path_alloc(); +	if (!p) +		return -ENOMEM; +	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); +	if (ret < 0) +		goto tlv_put_failure; +	memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); +	while (offset < end) { +		len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); + +		ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); +		if (ret < 0) +			break; +		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); +		TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); +		TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); +		ret = send_cmd(sctx); +		if (ret < 0) +			break; +		offset += len; +	} +tlv_put_failure: +	
fs_path_free(p); +	return ret; +} +  static int send_write_or_clone(struct send_ctx *sctx,  			       struct btrfs_path *path,  			       struct btrfs_key *key, @@ -3788,12 +4481,14 @@ static int send_write_or_clone(struct send_ctx *sctx,  	u64 len;  	u32 l;  	u8 type; +	u64 bs = sctx->send_root->fs_info->sb->s_blocksize;  	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],  			struct btrfs_file_extent_item);  	type = btrfs_file_extent_type(path->nodes[0], ei);  	if (type == BTRFS_FILE_EXTENT_INLINE) { -		len = btrfs_file_extent_inline_len(path->nodes[0], ei); +		len = btrfs_file_extent_inline_len(path->nodes[0], +						   path->slots[0], ei);  		/*  		 * it is possible the inline item won't cover the whole page,  		 * but there may be items after this page.  Make @@ -3811,7 +4506,7 @@ static int send_write_or_clone(struct send_ctx *sctx,  		goto out;  	} -	if (clone_root) { +	if (clone_root && IS_ALIGNED(offset + len, bs)) {  		ret = send_clone(sctx, offset, len, clone_root);  	} else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {  		ret = send_update_extent(sctx, offset, len); @@ -3926,16 +4621,16 @@ static int is_extent_unchanged(struct send_ctx *sctx,  	while (key.offset < ekey->offset + left_len) {  		ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);  		right_type = btrfs_file_extent_type(eb, ei); -		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); -		right_len = btrfs_file_extent_num_bytes(eb, ei); -		right_offset = btrfs_file_extent_offset(eb, ei); -		right_gen = btrfs_file_extent_generation(eb, ei); -  		if (right_type != BTRFS_FILE_EXTENT_REG) {  			ret = 0;  			goto out;  		} +		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); +		right_len = btrfs_file_extent_num_bytes(eb, ei); +		right_offset = btrfs_file_extent_offset(eb, ei); +		right_gen = btrfs_file_extent_generation(eb, ei); +  		/*  		 * Are we at extent 8? If yes, we know the extent is changed.  		 * This may only happen on the first iteration. 
@@ -4003,6 +4698,101 @@ out:  	return ret;  } +static int get_last_extent(struct send_ctx *sctx, u64 offset) +{ +	struct btrfs_path *path; +	struct btrfs_root *root = sctx->send_root; +	struct btrfs_file_extent_item *fi; +	struct btrfs_key key; +	u64 extent_end; +	u8 type; +	int ret; + +	path = alloc_path_for_send(); +	if (!path) +		return -ENOMEM; + +	sctx->cur_inode_last_extent = 0; + +	key.objectid = sctx->cur_ino; +	key.type = BTRFS_EXTENT_DATA_KEY; +	key.offset = offset; +	ret = btrfs_search_slot_for_read(root, &key, path, 0, 1); +	if (ret < 0) +		goto out; +	ret = 0; +	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); +	if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) +		goto out; + +	fi = btrfs_item_ptr(path->nodes[0], path->slots[0], +			    struct btrfs_file_extent_item); +	type = btrfs_file_extent_type(path->nodes[0], fi); +	if (type == BTRFS_FILE_EXTENT_INLINE) { +		u64 size = btrfs_file_extent_inline_len(path->nodes[0], +							path->slots[0], fi); +		extent_end = ALIGN(key.offset + size, +				   sctx->send_root->sectorsize); +	} else { +		extent_end = key.offset + +			btrfs_file_extent_num_bytes(path->nodes[0], fi); +	} +	sctx->cur_inode_last_extent = extent_end; +out: +	btrfs_free_path(path); +	return ret; +} + +static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, +			   struct btrfs_key *key) +{ +	struct btrfs_file_extent_item *fi; +	u64 extent_end; +	u8 type; +	int ret = 0; + +	if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) +		return 0; + +	if (sctx->cur_inode_last_extent == (u64)-1) { +		ret = get_last_extent(sctx, key->offset - 1); +		if (ret) +			return ret; +	} + +	fi = btrfs_item_ptr(path->nodes[0], path->slots[0], +			    struct btrfs_file_extent_item); +	type = btrfs_file_extent_type(path->nodes[0], fi); +	if (type == BTRFS_FILE_EXTENT_INLINE) { +		u64 size = btrfs_file_extent_inline_len(path->nodes[0], +							path->slots[0], fi); +		extent_end = ALIGN(key->offset + size, 
+				   sctx->send_root->sectorsize); +	} else { +		extent_end = key->offset + +			btrfs_file_extent_num_bytes(path->nodes[0], fi); +	} + +	if (path->slots[0] == 0 && +	    sctx->cur_inode_last_extent < key->offset) { +		/* +		 * We might have skipped entire leafs that contained only +		 * file extent items for our current inode. These leafs have +		 * a generation number smaller (older) than the one in the +		 * current leaf and the leaf our last extent came from, and +		 * are located between these 2 leafs. +		 */ +		ret = get_last_extent(sctx, key->offset - 1); +		if (ret) +			return ret; +	} + +	if (sctx->cur_inode_last_extent < key->offset) +		ret = send_hole(sctx, key->offset); +	sctx->cur_inode_last_extent = extent_end; +	return ret; +} +  static int process_extent(struct send_ctx *sctx,  			  struct btrfs_path *path,  			  struct btrfs_key *key) @@ -4019,7 +4809,7 @@ static int process_extent(struct send_ctx *sctx,  			goto out;  		if (ret) {  			ret = 0; -			goto out; +			goto out_hole;  		}  	} else {  		struct btrfs_file_extent_item *ei; @@ -4055,7 +4845,10 @@ static int process_extent(struct send_ctx *sctx,  		goto out;  	ret = send_write_or_clone(sctx, path, key, found_clone); - +	if (ret) +		goto out; +out_hole: +	ret = maybe_send_hole(sctx, path, key);  out:  	return ret;  } @@ -4078,17 +4871,25 @@ static int process_all_extents(struct send_ctx *sctx)  	key.objectid = sctx->cmp_key->objectid;  	key.type = BTRFS_EXTENT_DATA_KEY;  	key.offset = 0; -	while (1) { -		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); -		if (ret < 0) -			goto out; -		if (ret) { -			ret = 0; -			goto out; -		} +	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +	if (ret < 0) +		goto out; +	while (1) {  		eb = path->nodes[0];  		slot = path->slots[0]; + +		if (slot >= btrfs_header_nritems(eb)) { +			ret = btrfs_next_leaf(root, path); +			if (ret < 0) { +				goto out; +			} else if (ret > 0) { +				ret = 0; +				break; +			} +			continue; +		} +  		
btrfs_item_key_to_cpu(eb, &found_key, slot);  		if (found_key.objectid != key.objectid || @@ -4101,8 +4902,7 @@ static int process_all_extents(struct send_ctx *sctx)  		if (ret < 0)  			goto out; -		btrfs_release_path(path); -		key.offset = found_key.offset + 1; +		path->slots[0]++;  	}  out: @@ -4110,7 +4910,9 @@ out:  	return ret;  } -static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end) +static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, +					   int *pending_move, +					   int *refs_processed)  {  	int ret = 0; @@ -4122,17 +4924,11 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)  	if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))  		goto out; -	ret = process_recorded_refs(sctx); +	ret = process_recorded_refs(sctx, pending_move);  	if (ret < 0)  		goto out; -	/* -	 * We have processed the refs and thus need to advance send_progress. -	 * Now, calls to get_cur_xxx will take the updated refs of the current -	 * inode into account. -	 */ -	sctx->send_progress = sctx->cur_ino + 1; - +	*refs_processed = 1;  out:  	return ret;  } @@ -4148,11 +4944,29 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)  	u64 right_gid;  	int need_chmod = 0;  	int need_chown = 0; +	int pending_move = 0; +	int refs_processed = 0; -	ret = process_recorded_refs_if_needed(sctx, at_end); +	ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move, +					      &refs_processed);  	if (ret < 0)  		goto out; +	/* +	 * We have processed the refs and thus need to advance send_progress. +	 * Now, calls to get_cur_xxx will take the updated refs of the current +	 * inode into account. +	 * +	 * On the other hand, if our current inode is a directory and couldn't +	 * be moved/renamed because its parent was renamed/moved too and it has +	 * a higher inode number, we can only move/rename our current inode +	 * after we moved/renamed its parent. 
Therefore in this case operate on +	 * the old path (pre move/rename) of our current inode, and the +	 * move/rename will be performed later. +	 */ +	if (refs_processed && !pending_move) +		sctx->send_progress = sctx->cur_ino + 1; +  	if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)  		goto out;  	if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) @@ -4181,6 +4995,21 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)  	}  	if (S_ISREG(sctx->cur_inode_mode)) { +		if (need_send_hole(sctx)) { +			if (sctx->cur_inode_last_extent == (u64)-1 || +			    sctx->cur_inode_last_extent < +			    sctx->cur_inode_size) { +				ret = get_last_extent(sctx, (u64)-1); +				if (ret) +					goto out; +			} +			if (sctx->cur_inode_last_extent < +			    sctx->cur_inode_size) { +				ret = send_hole(sctx, sctx->cur_inode_size); +				if (ret) +					goto out; +			} +		}  		ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,  				sctx->cur_inode_size);  		if (ret < 0) @@ -4201,12 +5030,25 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)  	}  	/* -	 * Need to send that every time, no matter if it actually changed -	 * between the two trees as we have done changes to the inode before. +	 * If other directory inodes depended on our current directory +	 * inode's move/rename, now do their move/rename operations.  	 */ -	ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); -	if (ret < 0) -		goto out; +	if (!is_waiting_for_move(sctx, sctx->cur_ino)) { +		ret = apply_children_dir_moves(sctx); +		if (ret) +			goto out; +		/* +		 * Need to send that every time, no matter if it actually +		 * changed between the two trees as we have done changes to +		 * the inode before. If our inode is a directory and it's +		 * waiting to be moved/renamed, we will send its utimes when +		 * it's moved/renamed, therefore we don't need to do it here. 
+		 */ +		sctx->send_progress = sctx->cur_ino + 1; +		ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); +		if (ret < 0) +			goto out; +	}  out:  	return ret; @@ -4222,12 +5064,9 @@ static int changed_inode(struct send_ctx *sctx,  	u64 left_gen = 0;  	u64 right_gen = 0; -	ret = close_cur_inode_file(sctx); -	if (ret < 0) -		goto out; -  	sctx->cur_ino = key->objectid;  	sctx->cur_inode_new_gen = 0; +	sctx->cur_inode_last_extent = (u64)-1;  	/*  	 * Set send_progress to current inode. This will tell all get_cur_xxx @@ -4276,6 +5115,8 @@ static int changed_inode(struct send_ctx *sctx,  				sctx->left_path->nodes[0], left_ii);  		sctx->cur_inode_mode = btrfs_inode_mode(  				sctx->left_path->nodes[0], left_ii); +		sctx->cur_inode_rdev = btrfs_inode_rdev( +				sctx->left_path->nodes[0], left_ii);  		if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)  			ret = send_create_inode_if_needed(sctx);  	} else if (result == BTRFS_COMPARE_TREE_DELETED) { @@ -4320,6 +5161,8 @@ static int changed_inode(struct send_ctx *sctx,  					sctx->left_path->nodes[0], left_ii);  			sctx->cur_inode_mode = btrfs_inode_mode(  					sctx->left_path->nodes[0], left_ii); +			sctx->cur_inode_rdev = btrfs_inode_rdev( +					sctx->left_path->nodes[0], left_ii);  			ret = send_create_inode_if_needed(sctx);  			if (ret < 0)  				goto out; @@ -4508,14 +5351,18 @@ static int changed_cb(struct btrfs_root *left_root,  	struct send_ctx *sctx = ctx;  	if (result == BTRFS_COMPARE_TREE_SAME) { -		if (key->type != BTRFS_INODE_REF_KEY && -		    key->type != BTRFS_INODE_EXTREF_KEY) -			return 0; -		ret = compare_refs(sctx, left_path, key); -		if (!ret) +		if (key->type == BTRFS_INODE_REF_KEY || +		    key->type == BTRFS_INODE_EXTREF_KEY) { +			ret = compare_refs(sctx, left_path, key); +			if (!ret) +				return 0; +			if (ret < 0) +				return ret; +		} else if (key->type == BTRFS_EXTENT_DATA_KEY) { +			return maybe_send_hole(sctx, left_path, key); +		} else {  			return 0; -		if (ret < 0) -			return ret; +		} 
 		result = BTRFS_COMPARE_TREE_CHANGED;  		ret = 0;  	} @@ -4550,57 +5397,21 @@ out:  static int full_send_tree(struct send_ctx *sctx)  {  	int ret; -	struct btrfs_trans_handle *trans = NULL;  	struct btrfs_root *send_root = sctx->send_root;  	struct btrfs_key key;  	struct btrfs_key found_key;  	struct btrfs_path *path;  	struct extent_buffer *eb;  	int slot; -	u64 start_ctransid; -	u64 ctransid;  	path = alloc_path_for_send();  	if (!path)  		return -ENOMEM; -	spin_lock(&send_root->root_item_lock); -	start_ctransid = btrfs_root_ctransid(&send_root->root_item); -	spin_unlock(&send_root->root_item_lock); -  	key.objectid = BTRFS_FIRST_FREE_OBJECTID;  	key.type = BTRFS_INODE_ITEM_KEY;  	key.offset = 0; -join_trans: -	/* -	 * We need to make sure the transaction does not get committed -	 * while we do anything on commit roots. Join a transaction to prevent -	 * this. -	 */ -	trans = btrfs_join_transaction(send_root); -	if (IS_ERR(trans)) { -		ret = PTR_ERR(trans); -		trans = NULL; -		goto out; -	} - -	/* -	 * Make sure the tree has not changed after re-joining. We detect this -	 * by comparing start_ctransid and ctransid. They should always match. -	 */ -	spin_lock(&send_root->root_item_lock); -	ctransid = btrfs_root_ctransid(&send_root->root_item); -	spin_unlock(&send_root->root_item_lock); - -	if (ctransid != start_ctransid) { -		WARN(1, KERN_WARNING "btrfs: the root that you're trying to " -				     "send was modified in between. This is " -				     "probably a bug.\n"); -		ret = -EIO; -		goto out; -	} -  	ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);  	if (ret < 0)  		goto out; @@ -4608,19 +5419,6 @@ join_trans:  		goto out_finish;  	while (1) { -		/* -		 * When someone want to commit while we iterate, end the -		 * joined transaction and rejoin. 
-		 */ -		if (btrfs_should_end_transaction(trans, send_root)) { -			ret = btrfs_end_transaction(trans, send_root); -			trans = NULL; -			if (ret < 0) -				goto out; -			btrfs_release_path(path); -			goto join_trans; -		} -  		eb = path->nodes[0];  		slot = path->slots[0];  		btrfs_item_key_to_cpu(eb, &found_key, slot); @@ -4648,12 +5446,6 @@ out_finish:  out:  	btrfs_free_path(path); -	if (trans) { -		if (!ret) -			ret = btrfs_end_transaction(trans, send_root); -		else -			btrfs_end_transaction(trans, send_root); -	}  	return ret;  } @@ -4686,15 +5478,25 @@ static int send_subvol(struct send_ctx *sctx)  	}  out: -	if (!ret) -		ret = close_cur_inode_file(sctx); -	else -		close_cur_inode_file(sctx); -  	free_recorded_refs(sctx);  	return ret;  } +static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) +{ +	spin_lock(&root->root_item_lock); +	root->send_in_progress--; +	/* +	 * Not much left to do, we don't know why it's unbalanced and +	 * can't blindly reset it to 0. +	 */ +	if (root->send_in_progress < 0) +		btrfs_err(root->fs_info, +			"send_in_progres unbalanced %d root %llu", +			root->send_in_progress, root->root_key.objectid); +	spin_unlock(&root->root_item_lock); +} +  long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  {  	int ret = 0; @@ -4706,6 +5508,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  	struct send_ctx *sctx = NULL;  	u32 i;  	u64 *clone_sources_tmp = NULL; +	int clone_sources_to_rollback = 0; +	int sort_clone_roots = 0; +	int index;  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; @@ -4714,38 +5519,26 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  	fs_info = send_root->fs_info;  	/* +	 * The subvolume must remain read-only during send, protect against +	 * making it RW. This also protects against deletion. 
+	 */ +	spin_lock(&send_root->root_item_lock); +	send_root->send_in_progress++; +	spin_unlock(&send_root->root_item_lock); + +	/*  	 * This is done when we lookup the root, it should already be complete  	 * by the time we get here.  	 */  	WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);  	/* -	 * If we just created this root we need to make sure that the orphan -	 * cleanup has been done and committed since we search the commit root, -	 * so check its commit root transid with our otransid and if they match -	 * commit the transaction to make sure everything is updated. +	 * Userspace tools do the checks and warn the user if it's +	 * not RO.  	 */ -	down_read(&send_root->fs_info->extent_commit_sem); -	if (btrfs_header_generation(send_root->commit_root) == -	    btrfs_root_otransid(&send_root->root_item)) { -		struct btrfs_trans_handle *trans; - -		up_read(&send_root->fs_info->extent_commit_sem); - -		trans = btrfs_attach_transaction_barrier(send_root); -		if (IS_ERR(trans)) { -			if (PTR_ERR(trans) != -ENOENT) { -				ret = PTR_ERR(trans); -				goto out; -			} -			/* ENOENT means theres no transaction */ -		} else { -			ret = btrfs_commit_transaction(trans, send_root); -			if (ret) -				goto out; -		} -	} else { -		up_read(&send_root->fs_info->extent_commit_sem); +	if (!btrfs_root_readonly(send_root)) { +		ret = -EPERM; +		goto out;  	}  	arg = memdup_user(arg_, sizeof(*arg)); @@ -4756,8 +5549,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  	}  	if (!access_ok(VERIFY_READ, arg->clone_sources, -			sizeof(*arg->clone_sources * -			arg->clone_sources_count))) { +			sizeof(*arg->clone_sources) * +			arg->clone_sources_count)) {  		ret = -EFAULT;  		goto out;  	} @@ -4786,9 +5579,16 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  		goto out;  	} -	sctx->mnt = mnt_file->f_path.mnt; -  	sctx->send_root = send_root; +	/* +	 * Unlikely but possible, if the subvolume is marked for deletion but +	 * is slow to remove 
the directory entry, send can still be started +	 */ +	if (btrfs_root_dead(sctx->send_root)) { +		ret = -EPERM; +		goto out; +	} +  	sctx->clone_roots_cnt = arg->clone_sources_count;  	sctx->send_max_size = BTRFS_SEND_BUF_SIZE; @@ -4804,6 +5604,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  		goto out;  	} +	sctx->pending_dir_moves = RB_ROOT; +	sctx->waiting_dir_moves = RB_ROOT; +	sctx->orphan_dirs = RB_ROOT; +  	sctx->clone_roots = vzalloc(sizeof(struct clone_root) *  			(arg->clone_sources_count + 1));  	if (!sctx->clone_roots) { @@ -4831,11 +5635,27 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  			key.objectid = clone_sources_tmp[i];  			key.type = BTRFS_ROOT_ITEM_KEY;  			key.offset = (u64)-1; + +			index = srcu_read_lock(&fs_info->subvol_srcu); +  			clone_root = btrfs_read_fs_root_no_name(fs_info, &key);  			if (IS_ERR(clone_root)) { +				srcu_read_unlock(&fs_info->subvol_srcu, index);  				ret = PTR_ERR(clone_root);  				goto out;  			} +			clone_sources_to_rollback = i + 1; +			spin_lock(&clone_root->root_item_lock); +			clone_root->send_in_progress++; +			if (!btrfs_root_readonly(clone_root)) { +				spin_unlock(&clone_root->root_item_lock); +				srcu_read_unlock(&fs_info->subvol_srcu, index); +				ret = -EPERM; +				goto out; +			} +			spin_unlock(&clone_root->root_item_lock); +			srcu_read_unlock(&fs_info->subvol_srcu, index); +  			sctx->clone_roots[i].root = clone_root;  		}  		vfree(clone_sources_tmp); @@ -4846,11 +5666,28 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  		key.objectid = arg->parent_root;  		key.type = BTRFS_ROOT_ITEM_KEY;  		key.offset = (u64)-1; + +		index = srcu_read_lock(&fs_info->subvol_srcu); +  		sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);  		if (IS_ERR(sctx->parent_root)) { +			srcu_read_unlock(&fs_info->subvol_srcu, index);  			ret = PTR_ERR(sctx->parent_root);  			goto out;  		} + +		spin_lock(&sctx->parent_root->root_item_lock); +		
sctx->parent_root->send_in_progress++; +		if (!btrfs_root_readonly(sctx->parent_root) || +				btrfs_root_dead(sctx->parent_root)) { +			spin_unlock(&sctx->parent_root->root_item_lock); +			srcu_read_unlock(&fs_info->subvol_srcu, index); +			ret = -EPERM; +			goto out; +		} +		spin_unlock(&sctx->parent_root->root_item_lock); + +		srcu_read_unlock(&fs_info->subvol_srcu, index);  	}  	/* @@ -4864,8 +5701,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  	sort(sctx->clone_roots, sctx->clone_roots_cnt,  			sizeof(*sctx->clone_roots), __clone_root_cmp_sort,  			NULL); +	sort_clone_roots = 1; +	current->journal_info = (void *)BTRFS_SEND_TRANS_STUB;  	ret = send_subvol(sctx); +	current->journal_info = NULL;  	if (ret < 0)  		goto out; @@ -4879,6 +5719,58 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)  	}  out: +	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)); +	while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) { +		struct rb_node *n; +		struct pending_dir_move *pm; + +		n = rb_first(&sctx->pending_dir_moves); +		pm = rb_entry(n, struct pending_dir_move, node); +		while (!list_empty(&pm->list)) { +			struct pending_dir_move *pm2; + +			pm2 = list_first_entry(&pm->list, +					       struct pending_dir_move, list); +			free_pending_move(sctx, pm2); +		} +		free_pending_move(sctx, pm); +	} + +	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)); +	while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) { +		struct rb_node *n; +		struct waiting_dir_move *dm; + +		n = rb_first(&sctx->waiting_dir_moves); +		dm = rb_entry(n, struct waiting_dir_move, node); +		rb_erase(&dm->node, &sctx->waiting_dir_moves); +		kfree(dm); +	} + +	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs)); +	while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) { +		struct rb_node *n; +		struct orphan_dir_info *odi; + +		n = rb_first(&sctx->orphan_dirs); +		odi = rb_entry(n, struct orphan_dir_info, node); +		
free_orphan_dir_info(sctx, odi); +	} + +	if (sort_clone_roots) { +		for (i = 0; i < sctx->clone_roots_cnt; i++) +			btrfs_root_dec_send_in_progress( +					sctx->clone_roots[i].root); +	} else { +		for (i = 0; sctx && i < clone_sources_to_rollback; i++) +			btrfs_root_dec_send_in_progress( +					sctx->clone_roots[i].root); + +		btrfs_root_dec_send_in_progress(send_root); +	} +	if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) +		btrfs_root_dec_send_in_progress(sctx->parent_root); +  	kfree(arg);  	vfree(clone_sources_tmp);  | 
