aboutsummaryrefslogtreecommitdiff
path: root/fs/ocfs2/extent_map.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2/extent_map.c')
-rw-r--r--fs/ocfs2/extent_map.c1233
1 files changed, 372 insertions, 861 deletions
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 80ac69f11d9..ba2b2ab1c6e 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -3,8 +3,7 @@
*
* extent_map.c
*
- * In-memory extent map for OCFS2. Man, this code was prettier in
- * the library.
+ * Block/Cluster mapping functions
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
@@ -26,1016 +25,528 @@
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/rbtree.h>
#define MLOG_MASK_PREFIX ML_EXTENT_MAP
#include <cluster/masklog.h>
#include "ocfs2.h"
+#include "alloc.h"
#include "extent_map.h"
#include "inode.h"
#include "super.h"
#include "buffer_head_io.h"
-
/*
- * SUCK SUCK SUCK
- * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
- */
-
-struct ocfs2_extent_map_entry {
- struct rb_node e_node;
- int e_tree_depth;
- struct ocfs2_extent_rec e_rec;
-};
-
-struct ocfs2_em_insert_context {
- int need_left;
- int need_right;
- struct ocfs2_extent_map_entry *new_ent;
- struct ocfs2_extent_map_entry *old_ent;
- struct ocfs2_extent_map_entry *left_ent;
- struct ocfs2_extent_map_entry *right_ent;
-};
-
-static struct kmem_cache *ocfs2_em_ent_cachep = NULL;
-
-
-static struct ocfs2_extent_map_entry *
-ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
- u32 cpos, u32 clusters,
- struct rb_node ***ret_p,
- struct rb_node **ret_parent);
-static int ocfs2_extent_map_insert(struct inode *inode,
- struct ocfs2_extent_rec *rec,
- int tree_depth);
-static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
- struct ocfs2_extent_map_entry *ent);
-static int ocfs2_extent_map_find_leaf(struct inode *inode,
- u32 cpos, u32 clusters,
- struct ocfs2_extent_list *el);
-static int ocfs2_extent_map_lookup_read(struct inode *inode,
- u32 cpos, u32 clusters,
- struct ocfs2_extent_map_entry **ret_ent);
-static int ocfs2_extent_map_try_insert(struct inode *inode,
- struct ocfs2_extent_rec *rec,
- int tree_depth,
- struct ocfs2_em_insert_context *ctxt);
-
-/* returns 1 only if the rec contains all the given clusters -- that is that
- * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
- * clusters) is >= the argument's endpoint */
-static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
- u32 cpos, u32 clusters)
-{
- if (le32_to_cpu(rec->e_cpos) > cpos)
- return 0;
- if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
- le32_to_cpu(rec->e_clusters))
- return 0;
- return 1;
-}
-
-
-/*
- * Find an entry in the tree that intersects the region passed in.
- * Note that this will find straddled intervals, it is up to the
- * callers to enforce any boundary conditions.
- *
- * Callers must hold ip_lock. This lookup is not guaranteed to return
- * a tree_depth 0 match, and as such can race inserts if the lock
- * were not held.
+ * The extent caching implementation is intentionally trivial.
*
- * The rb_node garbage lets insertion share the search. Trivial
- * callers pass NULL.
+ * We only cache a small number of extents stored directly on the
+ * inode, so linear order operations are acceptable. If we ever want
+ * to increase the size of the extent map, then these algorithms must
+ * get smarter.
*/
-static struct ocfs2_extent_map_entry *
-ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
- u32 cpos, u32 clusters,
- struct rb_node ***ret_p,
- struct rb_node **ret_parent)
+
+void ocfs2_extent_map_init(struct inode *inode)
{
- struct rb_node **p = &em->em_extents.rb_node;
- struct rb_node *parent = NULL;
- struct ocfs2_extent_map_entry *ent = NULL;
-
- while (*p)
- {
- parent = *p;
- ent = rb_entry(parent, struct ocfs2_extent_map_entry,
- e_node);
- if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
- p = &(*p)->rb_left;
- ent = NULL;
- } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
- le32_to_cpu(ent->e_rec.e_clusters))) {
- p = &(*p)->rb_right;
- ent = NULL;
- } else
- break;
- }
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
- if (ret_p != NULL)
- *ret_p = p;
- if (ret_parent != NULL)
- *ret_parent = parent;
- return ent;
+ oi->ip_extent_map.em_num_items = 0;
+ INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
}
-/*
- * Find the leaf containing the interval we want. While we're on our
- * way down the tree, fill in every record we see at any depth, because
- * we might want it later.
- *
- * Note that this code is run without ip_lock. That's because it
- * sleeps while reading. If someone is also filling the extent list at
- * the same time we are, we might have to restart.
- */
-static int ocfs2_extent_map_find_leaf(struct inode *inode,
- u32 cpos, u32 clusters,
- struct ocfs2_extent_list *el)
+static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+ unsigned int cpos,
+ struct ocfs2_extent_map_item **ret_emi)
{
- int i, ret;
- struct buffer_head *eb_bh = NULL;
- u64 blkno;
- u32 rec_end;
- struct ocfs2_extent_block *eb;
- struct ocfs2_extent_rec *rec;
-
- /*
- * The bh data containing the el cannot change here, because
- * we hold alloc_sem. So we can do this without other
- * locks.
- */
- while (el->l_tree_depth)
- {
- blkno = 0;
- for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
- rec = &el->l_recs[i];
- rec_end = (le32_to_cpu(rec->e_cpos) +
- le32_to_cpu(rec->e_clusters));
-
- ret = -EBADR;
- if (rec_end > OCFS2_I(inode)->ip_clusters) {
- mlog_errno(ret);
- ocfs2_error(inode->i_sb,
- "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
- i,
- (unsigned long long)le64_to_cpu(rec->e_blkno),
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- OCFS2_I(inode)->ip_clusters);
- goto out_free;
- }
-
- if (rec_end <= cpos) {
- ret = ocfs2_extent_map_insert(inode, rec,
- le16_to_cpu(el->l_tree_depth));
- if (ret && (ret != -EEXIST)) {
- mlog_errno(ret);
- goto out_free;
- }
- continue;
- }
- if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
- ret = ocfs2_extent_map_insert(inode, rec,
- le16_to_cpu(el->l_tree_depth));
- if (ret && (ret != -EEXIST)) {
- mlog_errno(ret);
- goto out_free;
- }
- continue;
- }
+ unsigned int range;
+ struct ocfs2_extent_map_item *emi;
- /*
- * We've found a record that matches our
- * interval. We don't insert it because we're
- * about to traverse it.
- */
-
- /* Check to see if we're stradling */
- ret = -ESRCH;
- if (!ocfs2_extent_rec_contains_clusters(rec,
- cpos,
- clusters)) {
- mlog_errno(ret);
- goto out_free;
- }
+ *ret_emi = NULL;
- /*
- * If we've already found a record, the el has
- * two records covering the same interval.
- * EEEK!
- */
- ret = -EBADR;
- if (blkno) {
- mlog_errno(ret);
- ocfs2_error(inode->i_sb,
- "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n",
- cpos, clusters,
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)blkno, i,
- (unsigned long long)le64_to_cpu(rec->e_blkno));
- goto out_free;
- }
+ list_for_each_entry(emi, &em->em_list, ei_list) {
+ range = emi->ei_cpos + emi->ei_clusters;
- blkno = le64_to_cpu(rec->e_blkno);
- }
+ if (cpos >= emi->ei_cpos && cpos < range) {
+ list_move(&emi->ei_list, &em->em_list);
- /*
- * We don't support holes, and we're still up
- * in the branches, so we'd better have found someone
- */
- ret = -EBADR;
- if (!blkno) {
- ocfs2_error(inode->i_sb,
- "No record found for (cpos = %u, clusters = %u) on inode %llu\n",
- cpos, clusters,
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- mlog_errno(ret);
- goto out_free;
- }
-
- if (eb_bh) {
- brelse(eb_bh);
- eb_bh = NULL;
- }
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- blkno, &eb_bh, OCFS2_BH_CACHED,
- inode);
- if (ret) {
- mlog_errno(ret);
- goto out_free;
- }
- eb = (struct ocfs2_extent_block *)eb_bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- ret = -EIO;
- goto out_free;
+ *ret_emi = emi;
+ break;
}
- el = &eb->h_list;
}
+}
- BUG_ON(el->l_tree_depth);
-
- for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
- rec = &el->l_recs[i];
-
- if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
- OCFS2_I(inode)->ip_clusters) {
- ret = -EBADR;
- mlog_errno(ret);
- ocfs2_error(inode->i_sb,
- "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
- i,
- (unsigned long long)le64_to_cpu(rec->e_blkno),
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- OCFS2_I(inode)->ip_clusters);
- return ret;
- }
-
- ret = ocfs2_extent_map_insert(inode, rec,
- le16_to_cpu(el->l_tree_depth));
- if (ret && (ret != -EEXIST)) {
- mlog_errno(ret);
- goto out_free;
- }
+static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
+ unsigned int *phys, unsigned int *len,
+ unsigned int *flags)
+{
+ unsigned int coff;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_extent_map_item *emi;
+
+ spin_lock(&oi->ip_lock);
+
+ __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
+ if (emi) {
+ coff = cpos - emi->ei_cpos;
+ *phys = emi->ei_phys + coff;
+ if (len)
+ *len = emi->ei_clusters - coff;
+ if (flags)
+ *flags = emi->ei_flags;
}
- ret = 0;
+ spin_unlock(&oi->ip_lock);
-out_free:
- if (eb_bh)
- brelse(eb_bh);
+ if (emi == NULL)
+ return -ENOENT;
- return ret;
+ return 0;
}
/*
- * This lookup actually will read from disk. It has one invariant:
- * It will never re-traverse blocks. This means that all inserts should
- * be new regions or more granular regions (both allowed by insert).
+ * Forget about all clusters equal to or greater than cpos.
*/
-static int ocfs2_extent_map_lookup_read(struct inode *inode,
- u32 cpos,
- u32 clusters,
- struct ocfs2_extent_map_entry **ret_ent)
+void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
{
- int ret;
- u64 blkno;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *ent;
- struct buffer_head *bh = NULL;
- struct ocfs2_extent_block *eb;
- struct ocfs2_dinode *di;
- struct ocfs2_extent_list *el;
-
- spin_lock(&OCFS2_I(inode)->ip_lock);
- ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
- if (ent) {
- if (!ent->e_tree_depth) {
- spin_unlock(&OCFS2_I(inode)->ip_lock);
- *ret_ent = ent;
- return 0;
- }
- blkno = le64_to_cpu(ent->e_rec.e_blkno);
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
- OCFS2_BH_CACHED, inode);
- if (ret) {
- mlog_errno(ret);
- if (bh)
- brelse(bh);
- return ret;
+ struct list_head *p, *n;
+ struct ocfs2_extent_map_item *emi;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_extent_map *em = &oi->ip_extent_map;
+ LIST_HEAD(tmp_list);
+ unsigned int range;
+
+ spin_lock(&oi->ip_lock);
+ list_for_each_safe(p, n, &em->em_list) {
+ emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
+
+ if (emi->ei_cpos >= cpos) {
+ /* Full truncate of this record. */
+ list_move(&emi->ei_list, &tmp_list);
+ BUG_ON(em->em_num_items == 0);
+ em->em_num_items--;
+ continue;
}
- eb = (struct ocfs2_extent_block *)bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- brelse(bh);
- return -EIO;
- }
- el = &eb->h_list;
- } else {
- spin_unlock(&OCFS2_I(inode)->ip_lock);
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- OCFS2_I(inode)->ip_blkno, &bh,
- OCFS2_BH_CACHED, inode);
- if (ret) {
- mlog_errno(ret);
- if (bh)
- brelse(bh);
- return ret;
+ range = emi->ei_cpos + emi->ei_clusters;
+ if (range > cpos) {
+ /* Partial truncate */
+ emi->ei_clusters = cpos - emi->ei_cpos;
}
- di = (struct ocfs2_dinode *)bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(di)) {
- brelse(bh);
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
- return -EIO;
- }
- el = &di->id2.i_list;
- }
-
- ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
- brelse(bh);
- if (ret) {
- mlog_errno(ret);
- return ret;
}
+ spin_unlock(&oi->ip_lock);
- ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
- if (!ent) {
- ret = -ESRCH;
- mlog_errno(ret);
- return ret;
+ list_for_each_safe(p, n, &tmp_list) {
+ emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
+ list_del(&emi->ei_list);
+ kfree(emi);
}
-
- /* FIXME: Make sure this isn't a corruption */
- BUG_ON(ent->e_tree_depth);
-
- *ret_ent = ent;
-
- return 0;
}
/*
- * Callers must hold ip_lock. This can insert pieces of the tree,
- * thus racing lookup if the lock weren't held.
+ * Is any part of emi2 contained within emi1
*/
-static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
- struct ocfs2_extent_map_entry *ent)
+static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
+ struct ocfs2_extent_map_item *emi2)
{
- struct rb_node **p, *parent;
- struct ocfs2_extent_map_entry *old_ent;
+ unsigned int range1, range2;
- old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
- le32_to_cpu(ent->e_rec.e_clusters),
- &p, &parent);
- if (old_ent)
- return -EEXIST;
+ /*
+ * Check if logical start of emi2 is inside emi1
+ */
+ range1 = emi1->ei_cpos + emi1->ei_clusters;
+ if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
+ return 1;
- rb_link_node(&ent->e_node, parent, p);
- rb_insert_color(&ent->e_node, &em->em_extents);
+ /*
+ * Check if logical end of emi2 is inside emi1
+ */
+ range2 = emi2->ei_cpos + emi2->ei_clusters;
+ if (range2 > emi1->ei_cpos && range2 <= range1)
+ return 1;
return 0;
}
+static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
+ struct ocfs2_extent_map_item *src)
+{
+ dest->ei_cpos = src->ei_cpos;
+ dest->ei_phys = src->ei_phys;
+ dest->ei_clusters = src->ei_clusters;
+ dest->ei_flags = src->ei_flags;
+}
/*
- * Simple rule: on any return code other than -EAGAIN, anything left
- * in the insert_context will be freed.
- *
- * Simple rule #2: A return code of -EEXIST from this function or
- * its calls to ocfs2_extent_map_insert_entry() signifies that another
- * thread beat us to the insert. It is not an actual error, but it
- * tells the caller we have no more work to do.
+ * Try to merge emi with ins. Returns 1 if merge succeeds, zero
+ * otherwise.
*/
-static int ocfs2_extent_map_try_insert(struct inode *inode,
- struct ocfs2_extent_rec *rec,
- int tree_depth,
- struct ocfs2_em_insert_context *ctxt)
+static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
+ struct ocfs2_extent_map_item *ins)
{
- int ret;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *old_ent;
-
- ctxt->need_left = 0;
- ctxt->need_right = 0;
- ctxt->old_ent = NULL;
-
- spin_lock(&OCFS2_I(inode)->ip_lock);
- ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
- if (!ret) {
- ctxt->new_ent = NULL;
- goto out_unlock;
- }
-
- /* Since insert_entry failed, the map MUST have old_ent */
- old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
- le32_to_cpu(rec->e_clusters),
- NULL, NULL);
-
- BUG_ON(!old_ent);
-
- if (old_ent->e_tree_depth < tree_depth) {
- /* Another thread beat us to the lower tree_depth */
- ret = -EEXIST;
- goto out_unlock;
- }
-
- if (old_ent->e_tree_depth == tree_depth) {
- /*
- * Another thread beat us to this tree_depth.
- * Let's make sure we agree with that thread (the
- * extent_rec should be identical).
- */
- if (!memcmp(rec, &old_ent->e_rec,
- sizeof(struct ocfs2_extent_rec)))
- ret = 0;
- else
- /* FIXME: Should this be ESRCH/EBADR??? */
- ret = -EEXIST;
-
- goto out_unlock;
- }
-
/*
- * We do it in this order specifically so that no actual tree
- * changes occur until we have all the pieces we need. We
- * don't want malloc failures to leave an inconsistent tree.
- * Whenever we drop the lock, another process could be
- * inserting. Also note that, if another process just beat us
- * to an insert, we might not need the same pieces we needed
- * the first go round. In the end, the pieces we need will
- * be used, and the pieces we don't will be freed.
+ * Handle contiguousness
*/
- ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
- le32_to_cpu(old_ent->e_rec.e_cpos));
- ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
- le32_to_cpu(old_ent->e_rec.e_clusters)) >
- (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
- ret = -EAGAIN;
- if (ctxt->need_left) {
- if (!ctxt->left_ent)
- goto out_unlock;
- *(ctxt->left_ent) = *old_ent;
- ctxt->left_ent->e_rec.e_clusters =
- cpu_to_le32(le32_to_cpu(rec->e_cpos) -
- le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
- }
- if (ctxt->need_right) {
- if (!ctxt->right_ent)
- goto out_unlock;
- *(ctxt->right_ent) = *old_ent;
- ctxt->right_ent->e_rec.e_cpos =
- cpu_to_le32(le32_to_cpu(rec->e_cpos) +
- le32_to_cpu(rec->e_clusters));
- ctxt->right_ent->e_rec.e_clusters =
- cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
- le32_to_cpu(old_ent->e_rec.e_clusters)) -
- le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
- }
-
- rb_erase(&old_ent->e_node, &em->em_extents);
- /* Now that he's erased, set him up for deletion */
- ctxt->old_ent = old_ent;
-
- if (ctxt->need_left) {
- ret = ocfs2_extent_map_insert_entry(em,
- ctxt->left_ent);
- if (ret)
- goto out_unlock;
- ctxt->left_ent = NULL;
+ if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
+ ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
+ ins->ei_flags == emi->ei_flags) {
+ emi->ei_clusters += ins->ei_clusters;
+ return 1;
+ } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
+ (ins->ei_cpos + ins->ei_clusters) == emi->ei_phys &&
+ ins->ei_flags == emi->ei_flags) {
+ emi->ei_phys = ins->ei_phys;
+ emi->ei_cpos = ins->ei_cpos;
+ emi->ei_clusters += ins->ei_clusters;
+ return 1;
}
- if (ctxt->need_right) {
- ret = ocfs2_extent_map_insert_entry(em,
- ctxt->right_ent);
- if (ret)
- goto out_unlock;
- ctxt->right_ent = NULL;
+ /*
+ * Overlapping extents - this shouldn't happen unless we've
+ * split an extent to change it's flags. That is exceedingly
+ * rare, so there's no sense in trying to optimize it yet.
+ */
+ if (ocfs2_ei_is_contained(emi, ins) ||
+ ocfs2_ei_is_contained(ins, emi)) {
+ ocfs2_copy_emi_fields(emi, ins);
+ return 1;
}
- ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
-
- if (!ret)
- ctxt->new_ent = NULL;
-
-out_unlock:
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
- return ret;
+ /* No merge was possible. */
+ return 0;
}
-
-static int ocfs2_extent_map_insert(struct inode *inode,
- struct ocfs2_extent_rec *rec,
- int tree_depth)
+/*
+ * In order to reduce complexity on the caller, this insert function
+ * is intentionally liberal in what it will accept.
+ *
+ * The only rule is that the truncate call *must* be used whenever
+ * records have been deleted. This avoids inserting overlapping
+ * records with different physical mappings.
+ */
+void ocfs2_extent_map_insert_rec(struct inode *inode,
+ struct ocfs2_extent_rec *rec)
{
- int ret;
- struct ocfs2_em_insert_context ctxt = {0, };
-
- if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
- OCFS2_I(inode)->ip_map.em_clusters) {
- ret = -EBADR;
- mlog_errno(ret);
- return ret;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_extent_map *em = &oi->ip_extent_map;
+ struct ocfs2_extent_map_item *emi, *new_emi = NULL;
+ struct ocfs2_extent_map_item ins;
+
+ ins.ei_cpos = le32_to_cpu(rec->e_cpos);
+ ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
+ le64_to_cpu(rec->e_blkno));
+ ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
+ ins.ei_flags = rec->e_flags;
+
+search:
+ spin_lock(&oi->ip_lock);
+
+ list_for_each_entry(emi, &em->em_list, ei_list) {
+ if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
+ list_move(&emi->ei_list, &em->em_list);
+ spin_unlock(&oi->ip_lock);
+ goto out;
+ }
}
- /* Zero e_clusters means a truncated tail record. It better be EOF */
- if (!rec->e_clusters) {
- if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
- OCFS2_I(inode)->ip_map.em_clusters) {
- ret = -EBADR;
- mlog_errno(ret);
- ocfs2_error(inode->i_sb,
- "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n",
- (unsigned long long)le64_to_cpu(rec->e_blkno),
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- return ret;
- }
+ /*
+ * No item could be merged.
+ *
+ * Either allocate and add a new item, or overwrite the last recently
+ * inserted.
+ */
- /* Ignore the truncated tail */
- return 0;
- }
+ if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
+ if (new_emi == NULL) {
+ spin_unlock(&oi->ip_lock);
- ret = -ENOMEM;
- ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
- GFP_NOFS);
- if (!ctxt.new_ent) {
- mlog_errno(ret);
- return ret;
- }
+ new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
+ if (new_emi == NULL)
+ goto out;
- ctxt.new_ent->e_rec = *rec;
- ctxt.new_ent->e_tree_depth = tree_depth;
-
- do {
- ret = -ENOMEM;
- if (ctxt.need_left && !ctxt.left_ent) {
- ctxt.left_ent =
- kmem_cache_alloc(ocfs2_em_ent_cachep,
- GFP_NOFS);
- if (!ctxt.left_ent)
- break;
- }
- if (ctxt.need_right && !ctxt.right_ent) {
- ctxt.right_ent =
- kmem_cache_alloc(ocfs2_em_ent_cachep,
- GFP_NOFS);
- if (!ctxt.right_ent)
- break;
+ goto search;
}
- ret = ocfs2_extent_map_try_insert(inode, rec,
- tree_depth, &ctxt);
- } while (ret == -EAGAIN);
-
- if ((ret < 0) && (ret != -EEXIST))
- mlog_errno(ret);
+ ocfs2_copy_emi_fields(new_emi, &ins);
+ list_add(&new_emi->ei_list, &em->em_list);
+ em->em_num_items++;
+ new_emi = NULL;
+ } else {
+ BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
+ emi = list_entry(em->em_list.prev,
+ struct ocfs2_extent_map_item, ei_list);
+ list_move(&emi->ei_list, &em->em_list);
+ ocfs2_copy_emi_fields(emi, &ins);
+ }
- if (ctxt.left_ent)
- kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
- if (ctxt.right_ent)
- kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
- if (ctxt.old_ent)
- kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
- if (ctxt.new_ent)
- kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
+ spin_unlock(&oi->ip_lock);
- return ret;
+out:
+ if (new_emi)
+ kfree(new_emi);
}
/*
- * Append this record to the tail of the extent map. It must be
- * tree_depth 0. The record might be an extension of an existing
- * record, and as such that needs to be handled. eg:
- *
- * Existing record in the extent map:
- *
- * cpos = 10, len = 10
- * |---------|
- *
- * New Record:
- *
- * cpos = 10, len = 20
- * |------------------|
- *
- * The passed record is the new on-disk record. The new_clusters value
- * is how many clusters were added to the file. If the append is a
- * contiguous append, the new_clusters has been added to
- * rec->e_clusters. If the append is an entirely new extent, then
- * rec->e_clusters is == new_clusters.
+ * Return the 1st index within el which contains an extent start
+ * larger than v_cluster.
*/
-int ocfs2_extent_map_append(struct inode *inode,
- struct ocfs2_extent_rec *rec,
- u32 new_clusters)
+static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
+ u32 v_cluster)
{
- int ret;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *ent;
- struct ocfs2_extent_rec *old;
-
- BUG_ON(!new_clusters);
- BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
+ int i;
+ struct ocfs2_extent_rec *rec;
- if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
- /*
- * Size changed underneath us on disk. Drop any
- * straddling records and update our idea of
- * i_clusters
- */
- ocfs2_extent_map_drop(inode, em->em_clusters - 1);
- em->em_clusters = OCFS2_I(inode)->ip_clusters;
- }
+ for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+ rec = &el->l_recs[i];
- mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
- le32_to_cpu(rec->e_clusters)) !=
- (em->em_clusters + new_clusters),
- "Inode %llu:\n"
- "rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
- "em->em_clusters = %u + new_clusters = %u = %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
- le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
- em->em_clusters, new_clusters,
- em->em_clusters + new_clusters);
-
- em->em_clusters += new_clusters;
-
- ret = -ENOENT;
- if (le32_to_cpu(rec->e_clusters) > new_clusters) {
- /* This is a contiguous append */
- ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
- NULL, NULL);
- if (ent) {
- old = &ent->e_rec;
- BUG_ON((le32_to_cpu(rec->e_cpos) +
- le32_to_cpu(rec->e_clusters)) !=
- (le32_to_cpu(old->e_cpos) +
- le32_to_cpu(old->e_clusters) +
- new_clusters));
- if (ent->e_tree_depth == 0) {
- BUG_ON(le32_to_cpu(old->e_cpos) !=
- le32_to_cpu(rec->e_cpos));
- BUG_ON(le64_to_cpu(old->e_blkno) !=
- le64_to_cpu(rec->e_blkno));
- ret = 0;
- }
- /*
- * Let non-leafs fall through as -ENOENT to
- * force insertion of the new leaf.
- */
- le32_add_cpu(&old->e_clusters, new_clusters);
- }
+ if (v_cluster < le32_to_cpu(rec->e_cpos))
+ break;
}
- if (ret == -ENOENT)
- ret = ocfs2_extent_map_insert(inode, rec, 0);
- if (ret < 0)
- mlog_errno(ret);
- return ret;
+ return i;
}
-#if 0
-/* Code here is included but defined out as it completes the extent
- * map api and may be used in the future. */
-
/*
- * Look up the record containing this cluster offset. This record is
- * part of the extent map. Do not free it. Any changes you make to
- * it will reflect in the extent map. So, if your last extent
- * is (cpos = 10, clusters = 10) and you truncate the file by 5
- * clusters, you can do:
+ * Figure out the size of a hole which starts at v_cluster within the given
+ * extent list.
*
- * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
- * rec->e_clusters -= 5;
+ * If there is no more allocation past v_cluster, we return the maximum
+ * cluster size minus v_cluster.
*
- * The lookup does not read from disk. If the map isn't filled in for
- * an entry, you won't find it.
- *
- * Also note that the returned record is valid until alloc_sem is
- * dropped. After that, truncate and extend can happen. Caveat Emptor.
+ * If we have in-inode extents, then el points to the dinode list and
+ * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
+ * containing el.
*/
-int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
- struct ocfs2_extent_rec **rec,
- int *tree_depth)
+static int ocfs2_figure_hole_clusters(struct inode *inode,
+ struct ocfs2_extent_list *el,
+ struct buffer_head *eb_bh,
+ u32 v_cluster,
+ u32 *num_clusters)
{
- int ret = -ENOENT;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *ent;
+ int ret, i;
+ struct buffer_head *next_eb_bh = NULL;
+ struct ocfs2_extent_block *eb, *next_eb;
- *rec = NULL;
+ i = ocfs2_search_for_hole_index(el, v_cluster);
- if (cpos >= OCFS2_I(inode)->ip_clusters)
- return -EINVAL;
+ if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
+ eb = (struct ocfs2_extent_block *)eb_bh->b_data;
- if (cpos >= em->em_clusters) {
/*
- * Size changed underneath us on disk. Drop any
- * straddling records and update our idea of
- * i_clusters
+ * Check the next leaf for any extents.
*/
- ocfs2_extent_map_drop(inode, em->em_clusters - 1);
- em->em_clusters = OCFS2_I(inode)->ip_clusters ;
- }
-
- ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
- NULL, NULL);
- if (ent) {
- *rec = &ent->e_rec;
- if (tree_depth)
- *tree_depth = ent->e_tree_depth;
- ret = 0;
- }
+ if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
+ goto no_more_extents;
- return ret;
-}
+ ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+ le64_to_cpu(eb->h_next_leaf_blk),
+ &next_eb_bh, OCFS2_BH_CACHED, inode);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
-int ocfs2_extent_map_get_clusters(struct inode *inode,
- u32 v_cpos, int count,
- u32 *p_cpos, int *ret_count)
-{
- int ret;
- u32 coff, ccount;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *ent = NULL;
+ if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
+ ret = -EROFS;
+ OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
+ goto out;
+ }
- *p_cpos = ccount = 0;
+ el = &next_eb->h_list;
- if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
- return -EINVAL;
+ i = ocfs2_search_for_hole_index(el, v_cluster);
+ }
- if ((v_cpos + count) > em->em_clusters) {
+no_more_extents:
+ if (i == le16_to_cpu(el->l_next_free_rec)) {
/*
- * Size changed underneath us on disk. Drop any
- * straddling records and update our idea of
- * i_clusters
+ * We're at the end of our existing allocation. Just
+ * return the maximum number of clusters we could
+ * possibly allocate.
*/
- ocfs2_extent_map_drop(inode, em->em_clusters - 1);
- em->em_clusters = OCFS2_I(inode)->ip_clusters;
+ *num_clusters = UINT_MAX - v_cluster;
+ } else {
+ *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
}
+ ret = 0;
+out:
+ brelse(next_eb_bh);
+ return ret;
+}
- ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
- if (ret)
- return ret;
+/*
+ * Return the index of the extent record which contains cluster #v_cluster.
+ * -1 is returned if it was not found.
+ *
+ * Should work fine on interior and exterior nodes.
+ */
+static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
+ u32 v_cluster)
+{
+ int ret = -1;
+ int i;
+ struct ocfs2_extent_rec *rec;
+ u32 rec_end, rec_start, clusters;
- if (ent) {
- /* We should never find ourselves straddling an interval */
- if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
- v_cpos,
- count))
- return -ESRCH;
+ for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+ rec = &el->l_recs[i];
- coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
- *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
- le64_to_cpu(ent->e_rec.e_blkno)) +
- coff;
+ rec_start = le32_to_cpu(rec->e_cpos);
+ clusters = ocfs2_rec_clusters(el, rec);
- if (ret_count)
- *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
+ rec_end = rec_start + clusters;
- return 0;
+ if (v_cluster >= rec_start && v_cluster < rec_end) {
+ ret = i;
+ break;
+ }
}
-
- return -ENOENT;
+ return ret;
}
-#endif /* 0 */
-
-int ocfs2_extent_map_get_blocks(struct inode *inode,
- u64 v_blkno, int count,
- u64 *p_blkno, int *ret_count)
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
+ u32 *p_cluster, u32 *num_clusters,
+ unsigned int *extent_flags)
{
- int ret;
- u64 boff;
- u32 cpos, clusters;
- int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
- struct ocfs2_extent_map_entry *ent = NULL;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+ int ret, i;
+ unsigned int flags = 0;
+ struct buffer_head *di_bh = NULL;
+ struct buffer_head *eb_bh = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec;
+ u32 coff;
- *p_blkno = 0;
-
- cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
- clusters = ocfs2_blocks_to_clusters(inode->i_sb,
- (u64)count + bpc - 1);
- if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
- ret = -EINVAL;
- mlog_errno(ret);
- return ret;
- }
-
- if ((cpos + clusters) > em->em_clusters) {
- /*
- * Size changed underneath us on disk. Drop any
- * straddling records and update our idea of
- * i_clusters
- */
- ocfs2_extent_map_drop(inode, em->em_clusters - 1);
- em->em_clusters = OCFS2_I(inode)->ip_clusters;
- }
+ ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
+ num_clusters, extent_flags);
+ if (ret == 0)
+ goto out;
- ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
+ ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
+ &di_bh, OCFS2_BH_CACHED, inode);
if (ret) {
mlog_errno(ret);
- return ret;
+ goto out;
}
- if (ent)
- {
- rec = &ent->e_rec;
+ di = (struct ocfs2_dinode *) di_bh->b_data;
+ el = &di->id2.i_list;
- /* We should never find ourselv