diff options
author | Joel Becker <jlbec@evilplan.org> | 2011-05-25 21:51:55 -0700 |
---|---|---|
committer | Joel Becker <jlbec@evilplan.org> | 2011-05-25 21:51:55 -0700 |
commit | ece928df16494becd43f999aff9bd530182e7e81 (patch) | |
tree | 905042764ea5d8ab6eda63666406e19f607bcf4c /fs/ocfs2 | |
parent | 3d1c1829ebe7e8bb48a997b39b4865abc9197e5e (diff) | |
parent | dda54e76d7dba0532ebdd72e0b4f492a03f83225 (diff) |
Merge branch 'move_extents' of git://oss.oracle.com/git/tye/linux-2.6 into ocfs2-merge-window
Conflicts:
fs/ocfs2/ioctl.c
Diffstat (limited to 'fs/ocfs2')
-rw-r--r-- | fs/ocfs2/Makefile | 1 | ||||
-rw-r--r-- | fs/ocfs2/ioctl.c | 468 | ||||
-rw-r--r-- | fs/ocfs2/move_extents.c | 1153 | ||||
-rw-r--r-- | fs/ocfs2/move_extents.h | 22 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2_ioctl.h | 68 | ||||
-rw-r--r-- | fs/ocfs2/refcounttree.c | 58 | ||||
-rw-r--r-- | fs/ocfs2/refcounttree.h | 11 |
7 files changed, 1725 insertions, 56 deletions
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index d8a0313e99e..f17e58b3298 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -30,6 +30,7 @@ ocfs2-objs := \ namei.o \ refcounttree.o \ reservations.o \ + move_extents.o \ resize.o \ slot_map.o \ suballoc.o \ diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 312a28f433a..bc91072b721 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -22,6 +22,11 @@ #include "ioctl.h" #include "resize.h" #include "refcounttree.h" +#include "sysfile.h" +#include "dir.h" +#include "buffer_head_io.h" +#include "suballoc.h" +#include "move_extents.h" #include <linux/ext2_fs.h> @@ -35,31 +40,27 @@ * be -EFAULT. The error will be returned from the ioctl(2) call. It's * just a best-effort to tell userspace that this request caused the error. */ -static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq, +static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, struct ocfs2_info_request __user *req) { kreq->ir_flags |= OCFS2_INFO_FL_ERROR; (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); } -#define o2info_set_request_error(a, b) \ - __o2info_set_request_error((struct ocfs2_info_request *)&(a), b) - -static inline void __o2info_set_request_filled(struct ocfs2_info_request *req) +static inline void o2info_set_request_filled(struct ocfs2_info_request *req) { req->ir_flags |= OCFS2_INFO_FL_FILLED; } -#define o2info_set_request_filled(a) \ - __o2info_set_request_filled((struct ocfs2_info_request *)&(a)) - -static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req) +static inline void o2info_clear_request_filled(struct ocfs2_info_request *req) { req->ir_flags &= ~OCFS2_INFO_FL_FILLED; } -#define o2info_clear_request_filled(a) \ - __o2info_clear_request_filled((struct ocfs2_info_request *)&(a)) +static inline int o2info_coherent(struct ocfs2_info_request *req) +{ + return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT)); +} static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) { @@ -153,7 +154,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode, oib.ib_blocksize = inode->i_sb->s_blocksize; - o2info_set_request_filled(oib); + o2info_set_request_filled(&oib.ib_req); if (o2info_to_user(oib, req)) goto bail; @@ -161,7 +162,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(oib, req); + o2info_set_request_error(&oib.ib_req, req); return status; } @@ -178,7 +179,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode, oic.ic_clustersize = osb->s_clustersize; - o2info_set_request_filled(oic); + o2info_set_request_filled(&oic.ic_req); if (o2info_to_user(oic, req)) goto bail; @@ -186,7 +187,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(oic, req); + o2info_set_request_error(&oic.ic_req, req); return status; } @@ -203,7 +204,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode, oim.im_max_slots = osb->max_slots; - o2info_set_request_filled(oim); + o2info_set_request_filled(&oim.im_req); if (o2info_to_user(oim, req)) goto bail; @@ -211,7 +212,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(oim, req); + o2info_set_request_error(&oim.im_req, req); return status; } @@ -228,7 +229,7 @@ int ocfs2_info_handle_label(struct inode *inode, memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); - o2info_set_request_filled(oil); + o2info_set_request_filled(&oil.il_req); if (o2info_to_user(oil, req)) goto bail; @@ -236,7 +237,7 @@ int ocfs2_info_handle_label(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(oil, req); + o2info_set_request_error(&oil.il_req, req); return status; } @@ -253,7 +254,7 @@ int ocfs2_info_handle_uuid(struct inode *inode, memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); - o2info_set_request_filled(oiu); + o2info_set_request_filled(&oiu.iu_req); if (o2info_to_user(oiu, req)) goto bail; @@ -261,7 +262,7 @@ int ocfs2_info_handle_uuid(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(oiu, req); + o2info_set_request_error(&oiu.iu_req, req); return status; } @@ -280,7 +281,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode, oif.if_incompat_features = osb->s_feature_incompat; oif.if_ro_compat_features = osb->s_feature_ro_compat; - o2info_set_request_filled(oif); + o2info_set_request_filled(&oif.if_req); if (o2info_to_user(oif, req)) goto bail; @@ -288,7 +289,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(oif, req); + o2info_set_request_error(&oif.if_req, req); return status; } @@ -305,7 +306,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode, oij.ij_journal_size = osb->journal->j_inode->i_size; - o2info_set_request_filled(oij); + o2info_set_request_filled(&oij.ij_req); if (o2info_to_user(oij, req)) goto bail; @@ -313,7 +314,408 @@ int ocfs2_info_handle_journal_size(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(oij, req); + o2info_set_request_error(&oij.ij_req, req); + + return status; +} + +int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, + struct inode *inode_alloc, u64 blkno, + struct ocfs2_info_freeinode *fi, u32 slot) +{ + int status = 0, unlock = 0; + + struct buffer_head *bh = NULL; + struct ocfs2_dinode *dinode_alloc = NULL; + + if (inode_alloc) + mutex_lock(&inode_alloc->i_mutex); + + if (o2info_coherent(&fi->ifi_req)) { + status = ocfs2_inode_lock(inode_alloc, &bh, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + unlock = 1; + } else { + status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + dinode_alloc = (struct ocfs2_dinode *)bh->b_data; + + fi->ifi_stat[slot].lfi_total = + le32_to_cpu(dinode_alloc->id1.bitmap1.i_total); + fi->ifi_stat[slot].lfi_free = + le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) - + le32_to_cpu(dinode_alloc->id1.bitmap1.i_used); + +bail: + if (unlock) + ocfs2_inode_unlock(inode_alloc, 0); + + if (inode_alloc) + mutex_unlock(&inode_alloc->i_mutex); + + brelse(bh); + + return status; +} + +int ocfs2_info_handle_freeinode(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + u32 i; + u64 blkno = -1; + char namebuf[40]; + int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE; + struct ocfs2_info_freeinode *oifi = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *inode_alloc = NULL; + + oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL); + if (!oifi) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + if (o2info_from_user(*oifi, req)) + goto bail; + + oifi->ifi_slotnum = osb->max_slots; + + for (i = 0; i < oifi->ifi_slotnum; i++) { + if (o2info_coherent(&oifi->ifi_req)) { + inode_alloc = ocfs2_get_system_file_inode(osb, type, i); + if (!inode_alloc) { + mlog(ML_ERROR, "unable to get alloc inode in " + "slot %u\n", i); + status = -EIO; + goto bail; + } + } else { + ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, i); + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, + namebuf, + strlen(namebuf), + &blkno); + if (status < 0) { + status = -ENOENT; + goto bail; + } + } + + status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); + if (status < 0) + goto bail; + + iput(inode_alloc); + inode_alloc = NULL; + } + + o2info_set_request_filled(&oifi->ifi_req); + + if (o2info_to_user(*oifi, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oifi->ifi_req, req); + + kfree(oifi); + + return status; +} + +static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist, + unsigned int chunksize) +{ + int index; + + index = __ilog2_u32(chunksize); + if (index >= OCFS2_INFO_MAX_HIST) + index = OCFS2_INFO_MAX_HIST - 1; + + hist->fc_chunks[index]++; + hist->fc_clusters[index] += chunksize; +} + +static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats, + unsigned int chunksize) +{ + if (chunksize > stats->ffs_max) + stats->ffs_max = chunksize; + + if (chunksize < stats->ffs_min) + stats->ffs_min = chunksize; + + stats->ffs_avg += chunksize; + stats->ffs_free_chunks_real++; +} + +void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, + unsigned int chunksize) +{ + o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); + o2ffg_update_stats(&(ffg->iff_ffs), chunksize); +} + +int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, + struct inode *gb_inode, + struct ocfs2_dinode *gb_dinode, + struct ocfs2_chain_rec *rec, + struct ocfs2_info_freefrag *ffg, + u32 chunks_in_group) +{ + int status = 0, used; + u64 blkno; + + struct buffer_head *bh = NULL; + struct ocfs2_group_desc *bg = NULL; + + unsigned int max_bits, num_clusters; + unsigned int offset = 0, cluster, chunk; + unsigned int chunk_free, last_chunksize = 0; + + if (!le32_to_cpu(rec->c_free)) + goto bail; + + do { + if (!bg) + blkno = le64_to_cpu(rec->c_blkno); + else + blkno = le64_to_cpu(bg->bg_next_group); + + if (bh) { + brelse(bh); + bh = NULL; + } + + if (o2info_coherent(&ffg->iff_req)) + status = ocfs2_read_group_descriptor(gb_inode, + gb_dinode, + blkno, &bh); + else + status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); + + if (status < 0) { + mlog(ML_ERROR, "Can't read the group descriptor # " + "%llu from device.", (unsigned long long)blkno); + status = -EIO; + goto bail; + } + + bg = (struct ocfs2_group_desc *)bh->b_data; + + if (!le16_to_cpu(bg->bg_free_bits_count)) + continue; + + max_bits = le16_to_cpu(bg->bg_bits); + offset = 0; + + for (chunk = 0; chunk < chunks_in_group; chunk++) { + /* + * last chunk may be not an entire one. + */ + if ((offset + ffg->iff_chunksize) > max_bits) + num_clusters = max_bits - offset; + else + num_clusters = ffg->iff_chunksize; + + chunk_free = 0; + for (cluster = 0; cluster < num_clusters; cluster++) { + used = ocfs2_test_bit(offset, + (unsigned long *)bg->bg_bitmap); + /* + * - chunk_free counts free clusters in #N chunk. + * - last_chunksize records the size(in) clusters + * for the last real free chunk being counted. + */ + if (!used) { + last_chunksize++; + chunk_free++; + } + + if (used && last_chunksize) { + ocfs2_info_update_ffg(ffg, + last_chunksize); + last_chunksize = 0; + } + + offset++; + } + + if (chunk_free == ffg->iff_chunksize) + ffg->iff_ffs.ffs_free_chunks++; + } + + /* + * need to update the info for last free chunk. + */ + if (last_chunksize) + ocfs2_info_update_ffg(ffg, last_chunksize); + + } while (le64_to_cpu(bg->bg_next_group)); + +bail: + brelse(bh); + + return status; +} + +int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, + struct inode *gb_inode, u64 blkno, + struct ocfs2_info_freefrag *ffg) +{ + u32 chunks_in_group; + int status = 0, unlock = 0, i; + + struct buffer_head *bh = NULL; + struct ocfs2_chain_list *cl = NULL; + struct ocfs2_chain_rec *rec = NULL; + struct ocfs2_dinode *gb_dinode = NULL; + + if (gb_inode) + mutex_lock(&gb_inode->i_mutex); + + if (o2info_coherent(&ffg->iff_req)) { + status = ocfs2_inode_lock(gb_inode, &bh, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + unlock = 1; + } else { + status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + gb_dinode = (struct ocfs2_dinode *)bh->b_data; + cl = &(gb_dinode->id2.i_chain); + + /* + * Chunksize(in) clusters from userspace should be + * less than clusters in a group. + */ + if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) { + status = -EINVAL; + goto bail; + } + + memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats)); + + ffg->iff_ffs.ffs_min = ~0U; + ffg->iff_ffs.ffs_clusters = + le32_to_cpu(gb_dinode->id1.bitmap1.i_total); + ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters - + le32_to_cpu(gb_dinode->id1.bitmap1.i_used); + + chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1; + + for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { + rec = &(cl->cl_recs[i]); + status = ocfs2_info_freefrag_scan_chain(osb, gb_inode, + gb_dinode, + rec, ffg, + chunks_in_group); + if (status) + goto bail; + } + + if (ffg->iff_ffs.ffs_free_chunks_real) + ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg / + ffg->iff_ffs.ffs_free_chunks_real); +bail: + if (unlock) + ocfs2_inode_unlock(gb_inode, 0); + + if (gb_inode) + mutex_unlock(&gb_inode->i_mutex); + + if (gb_inode) + iput(gb_inode); + + brelse(bh); + + return status; +} + +int ocfs2_info_handle_freefrag(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + u64 blkno = -1; + char namebuf[40]; + int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE; + + struct ocfs2_info_freefrag *oiff; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *gb_inode = NULL; + + oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL); + if (!oiff) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + if (o2info_from_user(*oiff, req)) + goto bail; + /* + * chunksize from userspace should be power of 2. + */ + if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) || + (!oiff->iff_chunksize)) { + status = -EINVAL; + goto bail; + } + + if (o2info_coherent(&oiff->iff_req)) { + gb_inode = ocfs2_get_system_file_inode(osb, type, + OCFS2_INVALID_SLOT); + if (!gb_inode) { + mlog(ML_ERROR, "unable to get global_bitmap inode\n"); + status = -EIO; + goto bail; + } + } else { + ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, + OCFS2_INVALID_SLOT); + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, + namebuf, + strlen(namebuf), + &blkno); + if (status < 0) { + status = -ENOENT; + goto bail; + } + } + + status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff); + if (status < 0) + goto bail; + + o2info_set_request_filled(&oiff->iff_req); + + if (o2info_to_user(*oiff, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(&oiff->iff_req, req); + + kfree(oiff); return status; } @@ -327,7 +729,7 @@ int ocfs2_info_handle_unknown(struct inode *inode, if (o2info_from_user(oir, req)) goto bail; - o2info_clear_request_filled(oir); + o2info_clear_request_filled(&oir); if (o2info_to_user(oir, req)) goto bail; @@ -335,7 +737,7 @@ int ocfs2_info_handle_unknown(struct inode *inode, status = 0; bail: if (status) - o2info_set_request_error(oir, req); + o2info_set_request_error(&oir, req); return status; } @@ -389,6 +791,14 @@ int ocfs2_info_handle_request(struct inode *inode, if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) status = ocfs2_info_handle_journal_size(inode, req); break; + case OCFS2_INFO_FREEINODE: + if (oir.ir_size == sizeof(struct ocfs2_info_freeinode)) + status = ocfs2_info_handle_freeinode(inode, req); + break; + case OCFS2_INFO_FREEFRAG: + if (oir.ir_size == sizeof(struct ocfs2_info_freefrag)) + status = ocfs2_info_handle_freefrag(inode, req); + break; default: status = ocfs2_info_handle_unknown(inode, req); break; @@ -565,6 +975,8 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return 0; } + case OCFS2_IOC_MOVE_EXT: + return ocfs2_ioctl_move_extents(filp, (void __user *)arg); default: return -ENOTTY; } @@ -608,6 +1020,8 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) return -EFAULT; return ocfs2_info_handle(inode, &info, 1); + case OCFS2_IOC_MOVE_EXT: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c new file mode 100644 index 00000000000..4c5488468c1 --- /dev/null +++ b/fs/ocfs2/move_extents.c @@ -0,0 +1,1153 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * move_extents.c + * + * Copyright (C) 2011 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/mount.h> +#include <linux/swap.h> + +#include <cluster/masklog.h> + +#include "ocfs2.h" +#include "ocfs2_ioctl.h" + +#include "alloc.h" +#include "aops.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "inode.h" +#include "journal.h" +#include "suballoc.h" +#include "uptodate.h" +#include "super.h" +#include "dir.h" +#include "buffer_head_io.h" +#include "sysfile.h" +#include "suballoc.h" +#include "refcounttree.h" +#include "move_extents.h" + +struct ocfs2_move_extents_context { + struct inode *inode; + struct file *file; + int auto_defrag; + int partial; + int credits; + u32 new_phys_cpos; + u32 clusters_moved; + u64 refcount_loc; + struct ocfs2_move_extents *range; + struct ocfs2_extent_tree et; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; +}; + +static int __ocfs2_move_extent(handle_t *handle, + struct ocfs2_move_extents_context *context, + u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos, + int ext_flags) +{ + int ret = 0, index; + struct inode *inode = context->inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_rec *rec, replace_rec; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el; + u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); + u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); + + ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos, + p_cpos, new_p_cpos, len); + if (ret) { + mlog_errno(ret); + goto out; + } + + memset(&replace_rec, 0, sizeof(replace_rec)); + replace_rec.e_cpos = cpu_to_le32(cpos); + replace_rec.e_leaf_clusters = cpu_to_le16(len); + replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, + new_p_cpos)); + + path = ocfs2_new_path_from_et(&context->et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + index = ocfs2_search_extent_list(el, cpos); + if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + ocfs2_error(inode->i_sb, + "Inode %llu has an extent at cpos %u which can no " + "longer be found.\n", + (unsigned long long)ino, cpos); + ret = -EROFS; + goto out; + } + + rec = &el->l_recs[index]; + + BUG_ON(ext_flags != rec->e_flags); + /* + * after moving/defraging to new location, the extent is not going + * to be refcounted anymore. + */ + replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + context->et.et_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_split_extent(handle, &context->et, path, index, + &replace_rec, context->meta_ac, + &context->dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_journal_dirty(handle, context->et.et_root_bh); + + context->new_phys_cpos = new_p_cpos; + + /* + * need I to append truncate log for old clusters? + */ + if (old_blkno) { + if (ext_flags & OCFS2_EXT_REFCOUNTED) + ret = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(osb->sb, + old_blkno), + len, context->meta_ac, + &context->dealloc, 1); + else + ret = ocfs2_truncate_log_append(osb, handle, + old_blkno, len); + } + +out: + return ret; +} + +/* + * lock allocators, and reserving appropriate number of bits for + * meta blocks and data clusters. + * + * in some cases, we don't need to reserve clusters, just let data_ac + * be NULL. + */ +static int ocfs2_lock_allocators_move_extents(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters_to_move, + u32 extents_to_split, + struct ocfs2_alloc_context **meta_ac, + struct ocfs2_alloc_context **data_ac, + int extra_blocks, + int *credits) +{ + int ret, num_free_extents; + unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + num_free_extents = ocfs2_num_free_extents(osb, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) + extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); + + ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (data_ac) { + ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el, + clusters_to_move + 2); + + mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n", + extra_blocks, clusters_to_move, *credits); +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + } + + return ret; +} + +/* + * Using one journal handle to guarantee the data consistency in case + * crash happens anywhere. + * + * XXX: defrag can end up with finishing partial extent as requested, + * due to not enough contiguous clusters can be found in allocator. + */ +static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, + u32 cpos, u32 phys_cpos, u32 *len, int ext_flags) +{ + int ret, credits = 0, extra_blocks = 0, partial = context->partial; + handle_t *handle; + struct inode *inode = context->inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_refcount_tree *ref_tree = NULL; + u32 new_phys_cpos, new_len; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + + if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & + OCFS2_HAS_REFCOUNT_FL)); + + BUG_ON(!context->refcount_loc); + + ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_prepare_refcount_change_for_del(inode, + context->refcount_loc, + phys_blkno, + *len, + &credits, + &extra_blocks); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, + &context->meta_ac, + &context->data_ac, + extra_blocks, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * should be using allocation reservation strategy there? + * + * if (context->data_ac) + * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + */ + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock_mutex; + } + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock_mutex; + } + + ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len, + &new_phys_cpos, &new_len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * allowing partial extent moving is kind of 'pros and cons', it makes + * whole defragmentation less likely to fail, on the contrary, the bad + * thing is it may make the fs even more fragmented after moving, let + * userspace make a good decision here. + */ + if (new_len != *len) { + mlog(0, "len_claimed: %u, len: %u\n", new_len, *len); + if (!partial) { + context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; + ret = -ENOSPC; + goto out_commit; + } + } + + mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos, + phys_cpos, new_phys_cpos); + + ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos, + new_phys_cpos, ext_flags); + if (ret) + mlog_errno(ret); + + if (partial && (new_len != *len)) + *len = new_len; + + /* + * Here we should write the new page out first if we are + * in write-back mode. + */ + ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_unlock_mutex: + mutex_unlock(&tl_inode->i_mutex); + + if (context->data_ac) { + ocfs2_free_alloc_context(context->data_ac); + context->data_ac = NULL; + } + + if (context->meta_ac) { + ocfs2_free_alloc_context(context->meta_ac); + context->meta_ac = NULL; + } + +out: + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + + return ret; +} + +/* + * find the victim alloc group, where #blkno fits. + */ +static int ocfs2_find_victim_alloc_group(struct inode *inode, + u64 vict_blkno, + int type, int slot, + int *vict_bit, + struct buffer_head **ret_bh) +{ + int ret, i, blocks_per_unit = 1; + u64 blkno; + char namebuf[40]; + + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *ac_bh = NULL, *gd_bh = NULL; + struct ocfs2_chain_list *cl; + struct ocfs2_chain_rec *rec; + struct ocfs2_dinode *ac_dinode; + struct ocfs2_group_desc *bg; + + ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); + ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, + strlen(namebuf), &blkno); + if (ret) { + ret = -ENOENT; + goto out; + } + + ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data; + cl = &(ac_dinode->id2.i_chain); + rec = &(cl->cl_recs[0]); + + if (type == GLOBAL_BITMAP_SYSTEM_INODE) + blocks_per_unit <<= (osb->s_clustersize_bits - + inode->i_sb->s_blocksize_bits); + /* + * 'vict_blkno' was out of the valid range. + */ + if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || + (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) * + blocks_per_unit))) { + ret = -EINVAL; + goto out; + } + + for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { + + rec = &(cl->cl_recs[i]); + if (!rec) + continue; + + bg = NULL; + + do { + if (!bg) + blkno = le64_to_cpu(rec->c_blkno); + else + blkno = le64_to_cpu(bg->bg_next_group); + + if (gd_bh) { + brelse(gd_bh); + gd_bh = NULL; + } + + ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + bg = (struct ocfs2_group_desc *)gd_bh->b_data; + + if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + + le16_to_cpu(bg->bg_bits))) { + + *ret_bh = gd_bh; + *vict_bit = (vict_blkno - blkno) / + blocks_per_unit; + mlog(0, "find the victim group: #%llu, " + "total_bits: %u, vict_bit: %u\n", + blkno, le16_to_cpu(bg->bg_bits), + *vict_bit); + goto out; + } + + } while (le64_to_cpu(bg->bg_next_group)); + } + + ret = -EINVAL; +out: + brelse(ac_bh); + + /* + * caller has to release the gd_bh properly. + */ + return ret; +} + +/* + * XXX: helper to validate and adjust moving goal. + */ +static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, + struct ocfs2_move_extents *range) +{ + int ret, goal_bit = 0; + + struct buffer_head *gd_bh = NULL; + struct ocfs2_group_desc *bg; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int c_to_b = 1 << (osb->s_clustersize_bits - + inode->i_sb->s_blocksize_bits); + + /* + * validate goal sits within global_bitmap, and return the victim + * group desc + */ + ret = ocfs2_find_victim_alloc_group(inode, range->me_goal, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT, + &goal_bit, &gd_bh); + if (ret) + goto out; + + bg = (struct ocfs2_group_desc *)gd_bh->b_data; + + /* + * make goal become cluster aligned. + */ + if (range->me_goal % c_to_b) + range->me_goal = range->me_goal / c_to_b * c_to_b; + + /* + * moving goal is not allowd to start with a group desc blok(#0 blk) + * let's compromise to the latter cluster. + */ + if (range->me_goal == le64_to_cpu(bg->bg_blkno)) + range->me_goal += c_to_b; + + /* + * movement is not gonna cross two groups. + */ + if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < + rang |