aboutsummaryrefslogtreecommitdiff
path: root/fs/exofs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/exofs')
-rw-r--r--fs/exofs/Kbuild6
-rw-r--r--fs/exofs/Kconfig.ore14
-rw-r--r--fs/exofs/common.h22
-rw-r--r--fs/exofs/dir.c77
-rw-r--r--fs/exofs/exofs.h189
-rw-r--r--fs/exofs/file.c34
-rw-r--r--fs/exofs/inode.c475
-rw-r--r--fs/exofs/ios.c803
-rw-r--r--fs/exofs/namei.c30
-rw-r--r--fs/exofs/ore.c1164
-rw-r--r--fs/exofs/ore_raid.c721
-rw-r--r--fs/exofs/ore_raid.h62
-rw-r--r--fs/exofs/pnfs.h45
-rw-r--r--fs/exofs/super.c538
-rw-r--r--fs/exofs/sys.c205
15 files changed, 3008 insertions, 1377 deletions
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index 2d0f757fda3..389ba8312d5 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -12,5 +12,9 @@
# Kbuild - Gets included from the Kernels Makefile and build system
#
-exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o
+# ore module library
+libore-y := ore.o ore_raid.o
+obj-$(CONFIG_ORE) += libore.o
+
+exofs-y := inode.o file.o symlink.o namei.o dir.o super.o sys.o
obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore
new file mode 100644
index 00000000000..2daf2329c28
--- /dev/null
+++ b/fs/exofs/Kconfig.ore
@@ -0,0 +1,14 @@
+# ORE - Objects Raid Engine (libore.ko)
+#
+# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
+# for every ORE user we do it like this. Any user should add itself here
+# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
+# selected here, and we default to "ON". So in effect it is like been
+# selected by any of the users.
+config ORE
+ tristate
+ depends on EXOFS_FS || PNFS_OBJLAYOUT
+ select ASYNC_XOR
+ select RAID6_PQ
+ select ASYNC_PQ
+ default SCSI_OSD_ULD
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index f0d520312d8..3bbd46956d7 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -53,10 +53,14 @@
#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
/* exofs Application specific page/attribute */
+/* Inode attrs */
# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
# define EXOFS_ATTR_INODE_DATA 1
# define EXOFS_ATTR_INODE_FILE_LAYOUT 2
# define EXOFS_ATTR_INODE_DIR_LAYOUT 3
+/* Partition attrs */
+# define EXOFS_APAGE_SB_DATA (0xF0000000U + 3)
+# define EXOFS_ATTR_SB_STATS 1
/*
* The maximum number of files we can have is limited by the size of the
@@ -86,8 +90,8 @@ enum {
*/
enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
struct exofs_fscb {
- __le64 s_nextid; /* Highest object ID used */
- __le64 s_numfiles; /* Number of files on fs */
+ __le64 s_nextid; /* Only used after mkfs */
+ __le64 s_numfiles; /* Only used after mkfs */
__le32 s_version; /* == EXOFS_FSCB_VER */
__le16 s_magic; /* Magic signature */
__le16 s_newfs; /* Non-zero if this is a new fs */
@@ -98,10 +102,20 @@ struct exofs_fscb {
} __packed;
/*
+ * This struct is set on the FS partition's attributes.
+ * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together
+ * with the create command, to atomically persist the sb writeable information.
+ */
+struct exofs_sb_stats {
+ __le64 s_nextid; /* Highest object ID used */
+ __le64 s_numfiles; /* Number of files on fs */
+} __packed;
+
+/*
* Describes the raid used in the FS. It is part of the device table.
* This here is taken from the pNFS-objects definition. In exofs we
* use one raid policy through-out the filesystem. (NOTE: the funny
- * alignment at begining. We take care of it at exofs_device_table.
+ * alignment at beginning. We take care of it at exofs_device_table.
*/
struct exofs_dt_data_map {
__le32 cb_num_comps;
@@ -122,7 +136,7 @@ struct exofs_dt_device_info {
u8 systemid[OSD_SYSTEMID_LEN];
__le64 long_name_offset; /* If !0 then offset-in-file */
__le32 osdname_len; /* */
- u8 osdname[44]; /* Embbeded, Ususally an asci uuid */
+ u8 osdname[44]; /* Embbeded, Usually an asci uuid */
} __packed;
/*
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index dcc941d82d6..49f51ab4caa 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -124,7 +124,7 @@ out:
Ebadsize:
EXOFS_ERR("ERROR [exofs_check_page]: "
- "size of directory #%lu is not a multiple of chunk size",
+ "size of directory(0x%lx) is not a multiple of chunk size\n",
dir->i_ino
);
goto fail;
@@ -142,8 +142,8 @@ Espan:
goto bad_entry;
bad_entry:
EXOFS_ERR(
- "ERROR [exofs_check_page]: bad entry in directory #%lu: %s - "
- "offset=%lu, inode=%llu, rec_len=%d, name_len=%d",
+ "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - "
+ "offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n",
dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
_LLU(le64_to_cpu(p->inode_no)),
rec_len, p->name_len);
@@ -151,8 +151,8 @@ bad_entry:
Eend:
p = (struct exofs_dir_entry *)(kaddr + offs);
EXOFS_ERR("ERROR [exofs_check_page]: "
- "entry in directory #%lu spans the page boundary"
- "offset=%lu, inode=%llu",
+ "entry in directory(0x%lx) spans the page boundary"
+ "offset=%lu, inode=0x%llx\n",
dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
_LLU(le64_to_cpu(p->inode_no)));
fail:
@@ -234,37 +234,33 @@ static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
static inline
void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
{
- mode_t mode = inode->i_mode;
+ umode_t mode = inode->i_mode;
de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
}
static int
-exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+exofs_readdir(struct file *file, struct dir_context *ctx)
{
- loff_t pos = filp->f_pos;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ loff_t pos = ctx->pos;
+ struct inode *inode = file_inode(file);
unsigned int offset = pos & ~PAGE_CACHE_MASK;
unsigned long n = pos >> PAGE_CACHE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
- unsigned char *types = NULL;
- int need_revalidate = (filp->f_version != inode->i_version);
+ int need_revalidate = (file->f_version != inode->i_version);
if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
return 0;
- types = exofs_filetype_table;
-
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
struct exofs_dir_entry *de;
struct page *page = exofs_get_page(inode, n);
if (IS_ERR(page)) {
- EXOFS_ERR("ERROR: "
- "bad page in #%lu",
- inode->i_ino);
- filp->f_pos += PAGE_CACHE_SIZE - offset;
+ EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
+ inode->i_ino);
+ ctx->pos += PAGE_CACHE_SIZE - offset;
return PTR_ERR(page);
}
kaddr = page_address(page);
@@ -272,9 +268,9 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (offset) {
offset = exofs_validate_entry(kaddr, offset,
chunk_mask);
- filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+ ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
}
- filp->f_version = inode->i_version;
+ file->f_version = inode->i_version;
need_revalidate = 0;
}
de = (struct exofs_dir_entry *)(kaddr + offset);
@@ -283,32 +279,30 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
for (; (char *)de <= limit; de = exofs_next_entry(de)) {
if (de->rec_len == 0) {
EXOFS_ERR("ERROR: "
- "zero-length directory entry");
+ "zero-length entry in directory(0x%lx)\n",
+ inode->i_ino);
exofs_put_page(page);
return -EIO;
}
if (de->inode_no) {
- int over;
- unsigned char d_type = DT_UNKNOWN;
+ unsigned char t;
- if (types && de->file_type < EXOFS_FT_MAX)
- d_type = types[de->file_type];
+ if (de->file_type < EXOFS_FT_MAX)
+ t = exofs_filetype_table[de->file_type];
+ else
+ t = DT_UNKNOWN;
- offset = (char *)de - kaddr;
- over = filldir(dirent, de->name, de->name_len,
- (n<<PAGE_CACHE_SHIFT) | offset,
+ if (!dir_emit(ctx, de->name, de->name_len,
le64_to_cpu(de->inode_no),
- d_type);
- if (over) {
+ t)) {
exofs_put_page(page);
return 0;
}
}
- filp->f_pos += le16_to_cpu(de->rec_len);
+ ctx->pos += le16_to_cpu(de->rec_len);
}
exofs_put_page(page);
}
-
return 0;
}
@@ -342,9 +336,9 @@ struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
kaddr += exofs_last_byte(dir, n) - reclen;
while ((char *) de <= kaddr) {
if (de->rec_len == 0) {
- EXOFS_ERR(
- "ERROR: exofs_find_entry: "
- "zero-length directory entry");
+ EXOFS_ERR("ERROR: zero-length entry in "
+ "directory(0x%lx)\n",
+ dir->i_ino);
exofs_put_page(page);
goto out;
}
@@ -472,7 +466,8 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode)
}
if (de->rec_len == 0) {
EXOFS_ERR("ERROR: exofs_add_link: "
- "zero-length directory entry");
+ "zero-length entry in directory(0x%lx)\n",
+ inode->i_ino);
err = -EIO;
goto out_unlock;
}
@@ -491,7 +486,8 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode)
exofs_put_page(page);
}
- EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode);
+ EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=0x%lx\n",
+ dentry, inode->i_ino);
return -EINVAL;
got_it:
@@ -542,7 +538,8 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
while (de < dir) {
if (de->rec_len == 0) {
EXOFS_ERR("ERROR: exofs_delete_entry:"
- "zero-length directory entry");
+ "zero-length entry in directory(0x%lx)\n",
+ inode->i_ino);
err = -EIO;
goto out;
}
@@ -594,7 +591,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
goto fail;
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
de = (struct exofs_dir_entry *)kaddr;
de->name_len = 1;
de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
@@ -608,7 +605,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
de->inode_no = cpu_to_le64(parent->i_ino);
memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
exofs_set_de_type(de, inode);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
err = exofs_commit_chunk(page, 0, chunk_size);
fail:
page_cache_release(page);
@@ -666,5 +663,5 @@ not_empty:
const struct file_operations exofs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = exofs_readdir,
+ .iterate = exofs_readdir,
};
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 2dc925fa101..fffe86fd7a4 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -36,12 +36,9 @@
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/backing-dev.h>
-#include "common.h"
+#include <scsi/osd_ore.h>
-/* FIXME: Remove once pnfs hits mainline
- * #include <linux/exportfs/pnfs_osd_xdr.h>
- */
-#include "pnfs.h"
+#include "common.h"
#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
@@ -56,45 +53,30 @@
/* u64 has problems with printk this will cast it to unsigned long long */
#define _LLU(x) (unsigned long long)(x)
-struct exofs_layout {
- osd_id s_pid; /* partition ID of file system*/
-
- /* Our way of looking at the data_map */
- unsigned stripe_unit;
- unsigned mirrors_p1;
-
- unsigned group_width;
- u64 group_depth;
- unsigned group_count;
-
- enum exofs_inode_layout_gen_functions lay_func;
-
- unsigned s_numdevs; /* Num of devices in array */
- struct osd_dev *s_ods[0]; /* Variable length */
+struct exofs_dev {
+ struct ore_dev ored;
+ unsigned did;
+ unsigned urilen;
+ uint8_t *uri;
+ struct kobject ed_kobj;
};
-
/*
* our extension to the in-memory superblock
*/
struct exofs_sb_info {
- struct exofs_fscb s_fscb; /* Written often, pre-allocate*/
+ struct backing_dev_info bdi; /* register our bdi with VFS */
+ struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/
int s_timeout; /* timeout for OSD operations */
uint64_t s_nextid; /* highest object ID used */
uint32_t s_numfiles; /* number of files on fs */
spinlock_t s_next_gen_lock; /* spinlock for gen # update */
u32 s_next_generation; /* next gen # to use */
atomic_t s_curr_pending; /* number of pending commands */
- uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
- struct backing_dev_info bdi; /* register our bdi with VFS */
-
- struct pnfs_osd_data_map data_map; /* Default raid to use
- * FIXME: Needed ?
- */
-/* struct exofs_layout dir_layout;*/ /* Default dir layout */
- struct exofs_layout layout; /* Default files layout,
- * contains the variable osd_dev
- * array. Keep last */
- struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */
+
+ struct ore_layout layout; /* Default files layout */
+ struct ore_comp one_comp; /* id & cred of partition id=0*/
+ struct ore_components oc; /* comps for the partition */
+ struct kobject s_kobj; /* holds per-sbi kobject */
};
/*
@@ -107,7 +89,8 @@ struct exofs_i_info {
uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
uint32_t i_dir_start_lookup; /* which page to start lookup */
uint64_t i_commit_size; /* the object's written length */
- uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */
+ struct ore_comp one_comp; /* same component for all devices */
+ struct ore_components oc; /* inode view of the device table */
};
static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
@@ -115,52 +98,6 @@ static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
}
-struct exofs_io_state;
-typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private);
-
-struct exofs_io_state {
- struct kref kref;
-
- void *private;
- exofs_io_done_fn done;
-
- struct exofs_layout *layout;
- struct osd_obj_id obj;
- u8 *cred;
-
- /* Global read/write IO*/
- loff_t offset;
- unsigned long length;
- void *kern_buff;
-
- struct page **pages;
- unsigned nr_pages;
- unsigned pgbase;
- unsigned pages_consumed;
-
- /* Attributes */
- unsigned in_attr_len;
- struct osd_attr *in_attr;
- unsigned out_attr_len;
- struct osd_attr *out_attr;
-
- /* Variable array of size numdevs */
- unsigned numdevs;
- struct exofs_per_dev_state {
- struct osd_request *or;
- struct bio *bio;
- loff_t offset;
- unsigned length;
- unsigned dev;
- } per_dev[];
-};
-
-static inline unsigned exofs_io_state_size(unsigned numdevs)
-{
- return sizeof(struct exofs_io_state) +
- sizeof(struct exofs_per_dev_state) * numdevs;
-}
-
/*
* our inode flags
*/
@@ -205,12 +142,6 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
}
/*
- * Given a layout, object_number and stripe_index return the associated global
- * dev_index
- */
-unsigned exofs_layout_od_id(struct exofs_layout *layout,
- osd_id obj_no, unsigned layout_index);
-/*
* Maximum count of links to a file
*/
#define EXOFS_LINK_MAX 32000
@@ -219,49 +150,15 @@ unsigned exofs_layout_od_id(struct exofs_layout *layout,
* function declarations *
*************************/
-/* ios.c */
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
- const struct osd_obj_id *obj);
-int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
- u64 offset, void *p, unsigned length);
-
-int exofs_get_io_state(struct exofs_layout *layout,
- struct exofs_io_state **ios);
-void exofs_put_io_state(struct exofs_io_state *ios);
-
-int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
-
-int exofs_sbi_create(struct exofs_io_state *ios);
-int exofs_sbi_remove(struct exofs_io_state *ios);
-int exofs_sbi_write(struct exofs_io_state *ios);
-int exofs_sbi_read(struct exofs_io_state *ios);
-
-int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr);
-
-int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len);
-static inline int exofs_oi_write(struct exofs_i_info *oi,
- struct exofs_io_state *ios)
-{
- ios->obj.id = exofs_oi_objno(oi);
- ios->cred = oi->i_cred;
- return exofs_sbi_write(ios);
-}
-
-static inline int exofs_oi_read(struct exofs_i_info *oi,
- struct exofs_io_state *ios)
-{
- ios->obj.id = exofs_oi_objno(oi);
- ios->cred = oi->i_cred;
- return exofs_sbi_read(ios);
-}
-
/* inode.c */
+unsigned exofs_max_io_pages(struct ore_layout *layout,
+ unsigned expected_pages);
int exofs_setattr(struct dentry *, struct iattr *);
int exofs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata);
extern struct inode *exofs_iget(struct super_block *, unsigned long);
-struct inode *exofs_new_inode(struct inode *, int);
+struct inode *exofs_new_inode(struct inode *, umode_t);
extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
extern void exofs_evict_inode(struct inode *);
@@ -279,7 +176,19 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
struct inode *);
/* super.c */
-int exofs_sync_fs(struct super_block *sb, int wait);
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
+ const struct osd_obj_id *obj);
+int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
+
+/* sys.c */
+int exofs_sysfs_init(void);
+void exofs_sysfs_uninit(void);
+int exofs_sysfs_sb_add(struct exofs_sb_info *sbi,
+ struct exofs_dt_device_info *dt_dev);
+void exofs_sysfs_sb_del(struct exofs_sb_info *sbi);
+int exofs_sysfs_odev_add(struct exofs_dev *edev,
+ struct exofs_sb_info *sbi);
+void exofs_sysfs_dbg_print(void);
/*********************
* operation vectors *
@@ -293,7 +202,6 @@ extern const struct file_operations exofs_file_operations;
/* inode.c */
extern const struct address_space_operations exofs_aops;
-extern const struct osd_attr g_attr_logical_length;
/* namei.c */
extern const struct inode_operations exofs_dir_inode_operations;
@@ -303,4 +211,35 @@ extern const struct inode_operations exofs_special_inode_operations;
extern const struct inode_operations exofs_symlink_inode_operations;
extern const struct inode_operations exofs_fast_symlink_inode_operations;
+/* exofs_init_comps will initialize an ore_components device array
+ * pointing to a single ore_comp struct, and a round-robin view
+ * of the device table.
+ * The first device of each inode is the [inode->ino % num_devices]
+ * and the rest of the devices sequentially following where the
+ * first device is after the last device.
+ * It is assumed that the global device array at @sbi is twice
+ * bigger and that the device table repeats twice.
+ * See: exofs_read_lookup_dev_table()
+ */
+static inline void exofs_init_comps(struct ore_components *oc,
+ struct ore_comp *one_comp,
+ struct exofs_sb_info *sbi, osd_id oid)
+{
+ unsigned dev_mod = (unsigned)oid, first_dev;
+
+ one_comp->obj.partition = sbi->one_comp.obj.partition;
+ one_comp->obj.id = oid;
+ exofs_make_credential(one_comp->cred, &one_comp->obj);
+
+ oc->first_dev = 0;
+ oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 *
+ sbi->layout.group_count;
+ oc->single_comp = EC_SINGLE_COMP;
+ oc->comps = one_comp;
+
+ /* Round robin device view of the table */
+ first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs;
+ oc->ods = &sbi->oc.ods[first_dev];
+}
+
#endif
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index b905c79b4f0..71bf8e4fb5d 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -42,25 +42,19 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
* Note, in exofs all metadata is written as part of inode, regardless.
* The writeout is synchronous
*/
-static int exofs_file_fsync(struct file *filp, int datasync)
+static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end,
+ int datasync)
{
- int ret;
struct inode *inode = filp->f_mapping->host;
- struct super_block *sb;
-
- if (!(inode->i_state & I_DIRTY))
- return 0;
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- return 0;
-
- ret = sync_inode_metadata(inode, 1);
+ int ret;
- /* This is a good place to write the sb */
- /* TODO: Sechedule an sb-sync on create */
- sb = inode->i_sb;
- if (sb->s_dirt)
- exofs_sync_fs(sb, 1);
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ return ret;
+ mutex_lock(&inode->i_mutex);
+ ret = sync_inode_metadata(filp->f_mapping->host, 1);
+ mutex_unlock(&inode->i_mutex);
return ret;
}
@@ -73,17 +67,17 @@ static int exofs_flush(struct file *file, fl_owner_t id)
const struct file_operations exofs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.open = generic_file_open,
.release = exofs_release_file,
.fsync = exofs_file_fsync,
.flush = exofs_flush,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
};
const struct inode_operations exofs_file_inode_operations = {
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index a7555238c41..3f9cafd7393 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,17 +37,20 @@
#define EXOFS_DBGMSG2(M...) do {} while (0)
-enum { BIO_MAX_PAGES_KMALLOC =
- (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
- MAX_PAGES_KMALLOC =
- PAGE_SIZE / sizeof(struct page *),
-};
+unsigned exofs_max_io_pages(struct ore_layout *layout,
+ unsigned expected_pages)
+{
+ unsigned pages = min_t(unsigned, expected_pages,
+ layout->max_io_length / PAGE_SIZE);
+
+ return pages;
+}
struct page_collect {
struct exofs_sb_info *sbi;
struct inode *inode;
unsigned expected_pages;
- struct exofs_io_state *ios;
+ struct ore_io_state *ios;
struct page **pages;
unsigned alloc_pages;
@@ -57,6 +60,7 @@ struct page_collect {
bool read_4_write; /* This means two things: that the read is sync
* And the pages should not be unlocked.
*/
+ struct page *that_locked_page;
};
static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -75,6 +79,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
pcol->length = 0;
pcol->pg_first = -1;
pcol->read_4_write = false;
+ pcol->that_locked_page = NULL;
}
static void _pcol_reset(struct page_collect *pcol)
@@ -87,29 +92,22 @@ static void _pcol_reset(struct page_collect *pcol)
pcol->length = 0;
pcol->pg_first = -1;
pcol->ios = NULL;
+ pcol->that_locked_page = NULL;
/* this is probably the end of the loop but in writes
* it might not end here. don't be left with nothing
*/
if (!pcol->expected_pages)
- pcol->expected_pages = MAX_PAGES_KMALLOC;
+ pcol->expected_pages =
+ exofs_max_io_pages(&pcol->sbi->layout, ~0);
}
static int pcol_try_alloc(struct page_collect *pcol)
{
- unsigned pages = min_t(unsigned, pcol->expected_pages,
- MAX_PAGES_KMALLOC);
-
- if (!pcol->ios) { /* First time allocate io_state */
- int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
-
- if (ret)
- return ret;
- }
+ unsigned pages;
/* TODO: easily support bio chaining */
- pages = min_t(unsigned, pages,
- pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC);
+ pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages);
for (; pages; pages >>= 1) {
pcol->pages = kmalloc(pages * sizeof(struct page *),
@@ -131,7 +129,7 @@ static void pcol_free(struct page_collect *pcol)
pcol->pages = NULL;
if (pcol->ios) {
- exofs_put_io_state(pcol->ios);
+ ore_put_io_state(pcol->ios);
pcol->ios = NULL;
}
}
@@ -147,14 +145,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page,
return 0;
}
+enum {PAGE_WAS_NOT_IN_IO = 17};
static int update_read_page(struct page *page, int ret)
{
- if (ret == 0) {
+ switch (ret) {
+ case 0:
/* Everything is OK */
SetPageUptodate(page);
if (PageError(page))
ClearPageError(page);
- } else if (ret == -EFAULT) {
+ break;
+ case -EFAULT:
/* In this case we were trying to read something that wasn't on
* disk yet - return a page full of zeroes. This should be OK,
* because the object should be empty (if there was a write
@@ -165,16 +166,22 @@ static int update_read_page(struct page *page, int ret)
SetPageUptodate(page);
if (PageError(page))
ClearPageError(page);
- ret = 0; /* recovered error */
EXOFS_DBGMSG("recovered read error\n");
- } else /* Error */
+ /* fall through */
+ case PAGE_WAS_NOT_IN_IO:
+ ret = 0; /* recovered error */
+ break;
+ default:
SetPageError(page);
-
+ }
return ret;
}
static void update_write_page(struct page *page, int ret)
{
+ if (unlikely(ret == PAGE_WAS_NOT_IN_IO))
+ return; /* don't pass start don't collect $200 */
+
if (ret) {
mapping_set_error(page->mapping, ret);
SetPageError(page);
@@ -188,15 +195,16 @@ static void update_write_page(struct page *page, int ret)
static int __readpages_done(struct page_collect *pcol)
{
int i;
- u64 resid;
u64 good_bytes;
u64 length = 0;
- int ret = exofs_check_io(pcol->ios, &resid);
+ int ret = ore_check_io(pcol->ios, NULL);
- if (likely(!ret))
+ if (likely(!ret)) {
good_bytes = pcol->length;
- else
- good_bytes = pcol->length - resid;
+ ret = PAGE_WAS_NOT_IN_IO;
+ } else {
+ good_bytes = 0;
+ }
EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
" length=0x%lx nr_pages=%u\n",
@@ -232,7 +240,7 @@ static int __readpages_done(struct page_collect *pcol)
}
/* callback of async reads */
-static void readpages_done(struct exofs_io_state *ios, void *p)
+static void readpages_done(struct ore_io_state *ios, void *p)
{
struct page_collect *pcol = p;
@@ -257,23 +265,70 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
}
}
+static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
+ struct page_collect *pcol_src, struct page_collect *pcol)
+{
+ /* length was wrong or offset was not page aligned */
+ BUG_ON(pcol_src->nr_pages < ios->nr_pages);
+
+ if (pcol_src->nr_pages > ios->nr_pages) {
+ struct page **src_page;
+ unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
+ unsigned long len_less = pcol_src->length - ios->length;
+ unsigned i;
+ int ret;
+
+ /* This IO was trimmed */
+ pcol_src->nr_pages = ios->nr_pages;
+ pcol_src->length = ios->length;
+
+ /* Left over pages are passed to the next io */
+ pcol->expected_pages += pages_less;
+ pcol->nr_pages = pages_less;
+ pcol->length = len_less;
+ src_page = pcol_src->pages + pcol_src->nr_pages;
+ pcol->pg_first = (*src_page)->index;
+
+ ret = pcol_try_alloc(pcol);
+ if (unlikely(ret))
+ return ret;
+
+ for (i = 0; i < pages_less; ++i)
+ pcol->pages[i] = *src_page++;
+
+ EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x "
+ "pages_less=0x%x expected_pages=0x%x "
+ "next_offset=0x%llx next_len=0x%lx\n",
+ pcol_src->nr_pages, pages_less, pcol->expected_pages,
+ pcol->pg_first * PAGE_SIZE, pcol->length);
+ }
+ return 0;
+}
+
static int read_exec(struct page_collect *pcol)
{
struct exofs_i_info *oi = exofs_i(pcol->inode);
- struct exofs_io_state *ios = pcol->ios;
+ struct ore_io_state *ios;
struct page_collect *pcol_copy = NULL;
int ret;
if (!pcol->pages)
return 0;
+ if (!pcol->ios) {
+ int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
+ pcol->pg_first << PAGE_CACHE_SHIFT,
+ pcol->length, &pcol->ios);
+
+ if (ret)
+ return ret;
+ }
+
+ ios = pcol->ios;
ios->pages = pcol->pages;
- ios->nr_pages = pcol->nr_pages;
- ios->length = pcol->length;
- ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
if (pcol->read_4_write) {
- exofs_oi_read(oi, pcol->ios);
+ ore_read(pcol->ios);
return __readpages_done(pcol);
}
@@ -286,26 +341,32 @@ static int read_exec(struct page_collect *pcol)
*pcol_copy = *pcol;
ios->done = readpages_done;
ios->private = pcol_copy;
- ret = exofs_oi_read(oi, ios);
+
+ /* pages ownership was passed to pcol_copy */
+ _pcol_reset(pcol);
+
+ ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
if (unlikely(ret))
goto err;
- atomic_inc(&pcol->sbi->s_curr_pending);
+ EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+ pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
+ ret = ore_read(ios);
+ if (unlikely(ret))
+ goto err;
- EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
- ios->obj.id, _LLU(ios->offset), pcol->length);
+ atomic_inc(&pcol->sbi->s_curr_pending);
- /* pages ownership was passed to pcol_copy */
- _pcol_reset(pcol);
return 0;
err:
- if (!pcol->read_4_write)
- _unlock_pcol_pages(pcol, ret, READ);
-
- pcol_free(pcol);
-
+ if (!pcol_copy) /* Failed before ownership transfer */
+ pcol_copy = pcol;
+ _unlock_pcol_pages(pcol_copy, ret, READ);
+ pcol_free(pcol_copy);
kfree(pcol_copy);
+
return ret;
}
@@ -326,11 +387,15 @@ static int readpage_strip(void *data, struct page *page)
size_t len;
int ret;
+ BUG_ON(!PageLocked(page));
+
/* FIXME: Just for debugging, will be removed */
if (PageUptodate(page))
EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
page->index);
+ pcol->that_locked_page = page;
+
if (page->index < end_index)
len = PAGE_CACHE_SIZE;
else if (page->index == end_index)
@@ -350,8 +415,10 @@ static int readpage_strip(void *data, struct page *page)
if (!pcol->read_4_write)
unlock_page(page);
- EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
- " splitting\n", inode->i_ino, page->index);
+ EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx "
+ "read_4_write=%d index=0x%lx end_index=0x%lx "
+ "splitting\n", inode->i_ino, len,
+ pcol->read_4_write, page->index, end_index);
return read_exec(pcol);
}
@@ -417,6 +484,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping,
return ret;
}
+ ret = read_exec(&pcol);
+ if (unlikely(ret))
+ return ret;
+
return read_exec(&pcol);
}
@@ -446,21 +517,22 @@ static int exofs_readpage(struct file *file, struct page *page)
}
/* Callback for osd_write. All writes are asynchronous */
-static void writepages_done(struct exofs_io_state *ios, void *p)
+static void writepages_done(struct ore_io_state *ios, void *p)
{
struct page_collect *pcol = p;
int i;
- u64 resid;
u64 good_bytes;
u64 length = 0;
- int ret = exofs_check_io(ios, &resid);
+ int ret = ore_check_io(ios, NULL);
atomic_dec(&pcol->sbi->s_curr_pending);
- if (likely(!ret))
+ if (likely(!ret)) {
good_bytes = pcol->length;
- else
- good_bytes = pcol->length - resid;
+ ret = PAGE_WAS_NOT_IN_IO;
+ } else {
+ good_bytes = 0;
+ }
EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
" length=0x%lx nr_pages=%u\n",
@@ -493,16 +565,82 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
EXOFS_DBGMSG2("writepages_done END\n");
}
+static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
+{
+ struct page_collect *pcol = priv;
+ pgoff_t index = offset / PAGE_SIZE;
+
+ if (!pcol->that_locked_page ||
+ (pcol->that_locked_page->index != index)) {
+ struct page *page;
+ loff_t i_size = i_size_read(pcol->inode);
+
+ if (offset >= i_size) {
+ *uptodate = true;
+ EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index);
+ return ZERO_PAGE(0);
+ }
+
+ page = find_get_page(pcol->inode->i_mapping, index);
+ if (!page) {
+ page = find_or_create_page(pcol->inode->i_mapping,
+ index, GFP_NOFS);
+ if (unlikely(!page)) {
+ EXOFS_DBGMSG("grab_cache_page Failed "
+ "index=0x%llx\n", _LLU(index));
+ return NULL;
+ }
+ unlock_page(page);
+ }
+ if (PageDirty(page) || PageWriteback(page))
+ *uptodate = true;
+ else
+ *uptodate = PageUptodate(page);
+ EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate);
+ return page;
+ } else {
+ EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n",
+ pcol->that_locked_page->index);
+ *uptodate = true;
+ return pcol->that_locked_page;
+ }
+}
+
+static void __r4w_put_page(void *priv, struct page *page)
+{
+ struct page_collect *pcol = priv;
+
+ if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
+ EXOFS_DBGMSG2("index=0x%lx\n", page->index);
+ page_cache_release(page);
+ return;
+ }
+ EXOFS_DBGMSG2("that_locked_page index=0x%lx\n",
+ ZERO_PAGE(0) == page ? -1 : page->index);
+}
+
+static const struct _ore_r4w_op _r4w_op = {
+ .get_page = &__r4w_get_page,
+ .put_page = &__r4w_put_page,
+};
+
static int write_exec(struct page_collect *pcol)
{
struct exofs_i_info *oi = exofs_i(pcol->inode);
- struct exofs_io_state *ios = pcol->ios;
+ struct ore_io_state *ios;
struct page_collect *pcol_copy = NULL;
int ret;
if (!pcol->pages)
return 0;
+ BUG_ON(pcol->ios);
+ ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
+ pcol->pg_first << PAGE_CACHE_SHIFT,
+ pcol->length, &pcol->ios);
+ if (unlikely(ret))
+ goto err;
+
pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
if (!pcol_copy) {
EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
@@ -512,30 +650,36 @@ static int write_exec(struct page_collect *pcol)
*pcol_copy = *pcol;
+ ios = pcol->ios;
ios->pages = pcol_copy->pages;
- ios->nr_pages = pcol_copy->nr_pages;
- ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
- ios->length = pcol_copy->length;
ios->done = writepages_done;
+ ios->r4w = &_r4w_op;
ios->private = pcol_copy;
- ret = exofs_oi_write(oi, ios);
+ /* pages ownership was passed to pcol_copy */
+ _pcol_reset(pcol);
+
+ ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+ if (unlikely(ret))
+ goto err;
+
+ EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+ pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
+ ret = ore_write(ios);
if (unlikely(ret)) {
- EXOFS_ERR("write_exec: exofs_oi_write() Failed\n");
+ EXOFS_ERR("write_exec: ore_write() Failed\n");
goto err;
}
atomic_inc(&pcol->sbi->s_curr_pending);
- EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
- pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
- pcol->length);
- /* pages ownership was passed to pcol_copy */
- _pcol_reset(pcol);
return 0;
err:
- _unlock_pcol_pages(pcol, ret, WRITE);
- pcol_free(pcol);
+ if (!pcol_copy) /* Failed before ownership transfer */
+ pcol_copy = pcol;
+ _unlock_pcol_pages(pcol_copy, ret, WRITE);
+ pcol_free(pcol_copy);
kfree(pcol_copy);
return ret;
@@ -670,14 +814,33 @@ static int exofs_writepages(struct address_space *mapping,
_pcol_init(&pcol, expected_pages, mapping->host);
ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
- if (ret) {
+ if (unlikely(ret)) {
EXOFS_ERR("write_cache_pages => %d\n", ret);
return ret;
}
- return write_exec(&pcol);
+ ret = write_exec(&pcol);
+ if (unlikely(ret))
+ return ret;
+
+ if (wbc->sync_mode == WB_SYNC_ALL) {
+ return write_exec(&pcol); /* pump the last reminder */
+ } else if (pcol.nr_pages) {
+ /* not SYNC let the reminder join the next writeout */
+ unsigned i;
+
+ for (i = 0; i < pcol.nr_pages; i++) {
+ struct page *page = pcol.pages[i];
+
+ end_page_writeback(page);
+ set_page_dirty(page);
+ unlock_page(page);
+ }
+ }
+ return 0;
}
+/*
static int exofs_writepage(struct page *page, struct writeback_control *wbc)
{
struct page_collect pcol;
@@ -693,12 +856,12 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
return write_exec(&pcol);
}
-
+*/
/* i_mutex held using inode->i_size directly */
static void _write_failed(struct inode *inode, loff_t to)
{
if (to > inode->i_size)
- truncate_pagecache(inode, to, inode->i_size);
+ truncate_pagecache(inode, inode->i_size);
}
int exofs_write_begin(struct file *file, struct address_space *mapping,
@@ -722,11 +885,28 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
/* read modify write */
if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
+ loff_t i_size = i_size_read(mapping->host);
+ pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ size_t rlen;
+
+ if (page->index < end_index)
+ rlen = PAGE_CACHE_SIZE;
+ else if (page->index == end_index)
+ rlen = i_size & ~PAGE_CACHE_MASK;
+ else
+ rlen = 0;
+
+ if (!rlen) {
+ clear_highpage(page);
+ SetPageUptodate(page);
+ goto out;
+ }
+
ret = _readpage(page, true);
if (ret) {
/*SetPageError was done by _readpage. Is it ok?*/
unlock_page(page);
- EXOFS_DBGMSG("__readpage_filler failed\n");
+ EXOFS_DBGMSG("__readpage failed\n");
}
}
out:
@@ -773,16 +953,26 @@ static int exofs_releasepage(struct page *page, gfp_t gfp)
return 0;
}
-static void exofs_invalidatepage(struct page *page, unsigned long offset)
+static void exofs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
+ EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
+ page->index, offset, length);
WARN_ON(1);
}
+
+ /* TODO: Should be easy enough to do proprly */
+static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb,
+ struct iov_iter *iter, loff_t offset)
+{
+ return 0;
+}
+
const struct address_space_operations exofs_aops = {
.readpage = exofs_readpage,
.readpages = exofs_readpages,
- .writepage = exofs_writepage,
+ .writepage = NULL,
.writepages = exofs_writepages,
.write_begin = exofs_write_begin_export,
.write_end = exofs_write_end,
@@ -792,10 +982,9 @@ const struct address_space_operations exofs_aops = {
/* Not implemented Yet */
.bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
- .direct_IO = NULL, /* TODO: Should be trivial to do */
+ .direct_IO = exofs_direct_IO,
/* With these NULL has special meaning or default is not exported */
- .sync_page = NULL,
.get_xip_mem = NULL,
.migratepage = NULL,
.launder_page = NULL,
@@ -817,21 +1006,19 @@ static inline int exofs_inode_is_fast_symlink(struct inode *inode)
return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
}
-const struct osd_attr g_attr_logical_length = ATTR_DEF(
- OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
-
static int _do_truncate(struct inode *inode, loff_t newsize)
{
struct exofs_i_info *oi = exofs_i(inode);
+ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
int ret;
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- ret = exofs_oi_truncate(oi, (u64)newsize);
+ ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize);
if (likely(!ret))
truncate_setsize(inode, newsize);
- EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n",
+ EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n",
inode->i_ino, newsize, ret);
return ret;
}
@@ -890,43 +1077,38 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
[1] = g_attr_inode_file_layout,
[2] = g_attr_inode_dir_layout,
};
- struct exofs_io_state *ios;
+ struct ore_io_state *ios;
struct exofs_on_disk_inode_layout *layout;
int ret;
- ret = exofs_get_io_state(&sbi->layout, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
if (unlikely(ret)) {
- EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
+ EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
return ret;
}
- ios->obj.id = exofs_oi_objno(oi);
- exofs_make_credential(oi->i_cred, &ios->obj);
- ios->cred = oi->i_cred;
-
- attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
- attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
+ attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
+ attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
ios->in_attr = attrs;
ios->in_attr_len = ARRAY_SIZE(attrs);
- ret = exofs_sbi_read(ios);
+ ret = ore_read(ios);
if (unlikely(ret)) {
EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n",
- _LLU(ios->obj.id), ret);
+ _LLU(oi->one_comp.obj.id), ret);
memset(inode, 0, sizeof(*inode));
inode->i_mode = 0040000 | (0777 & ~022);
/* If object is lost on target we might as well enable it's
* delete.
*/
- if ((ret == -ENOENT) || (ret == -EINVAL))
- ret = 0;
+ ret = 0;
goto out;
}
ret = extract_attr_from_ios(ios, &attrs[0]);
if (ret) {
- EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+ EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__);
goto out;
}
WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
@@ -934,7 +1116,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
ret = extract_attr_from_ios(ios, &attrs[1]);
if (ret) {
- EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+ EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__);
goto out;
}
if (attrs[1].len) {
@@ -949,7 +1131,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
ret = extract_attr_from_ios(ios, &attrs[2]);
if (ret) {
- EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+ EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__);
goto out;
}
if (attrs[2].len) {
@@ -963,7 +1145,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
}
out:
- exofs_put_io_state(ios);
+ ore_put_io_state(ios);
return ret;
}
@@ -989,6 +1171,8 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
return inode;
oi = exofs_i(inode);
__oi_init(oi);
+ exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
+ exofs_oi_objno(oi));
/* read the inode from the osd */
ret = exofs_get_inode(sb, oi, &fcb);
@@ -999,9 +1183,9 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
/* copy stuff from on-disk struct to in-memory struct */
inode->i_mode = le16_to_cpu(fcb.i_mode);
- inode->i_uid = le32_to_cpu(fcb.i_uid);
- inode->i_gid = le32_to_cpu(fcb.i_gid);
- inode->i_nlink = le16_to_cpu(fcb.i_links_count);
+ i_uid_write(inode, le32_to_cpu(fcb.i_uid));
+ i_gid_write(inode, le32_to_cpu(fcb.i_gid));
+ set_nlink(inode, le16_to_cpu(fcb.i_links_count));
inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
@@ -1030,6 +1214,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
}
+ inode->i_mapping->backing_dev_info = sb->s_bdi;
if (S_ISREG(inode->i_mode)) {
inode->i_op = &exofs_file_inode_operations;
inode->i_fop = &exofs_file_operations;
@@ -1073,26 +1258,28 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
}
return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
}
+
/*
* Callback function from exofs_new_inode(). The important thing is that we
* set the obj_created flag so that other methods know that the object exists on
* the OSD.
*/
-static void create_done(struct exofs_io_state *ios, void *p)
+static void create_done(struct ore_io_state *ios, void *p)
{
struct inode *inode = p;
struct exofs_i_info *oi = exofs_i(inode);
struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
int ret;
- ret = exofs_check_io(ios, NULL);
- exofs_put_io_state(ios);
+ ret = ore_check_io(ios, NULL);
+ ore_put_io_state(ios);
atomic_dec(&sbi->s_curr_pending);
if (unlikely(ret)) {
EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
- _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
+ _LLU(exofs_oi_objno(oi)),
+ _LLU(oi->one_comp.obj.partition));
/*TODO: When FS is corrupted creation can fail, object already
* exist. Get rid of this asynchronous creation, if exist
* increment the obj counter and try the next object. Until we
@@ -1109,16 +1296,15 @@ static void create_done(struct exofs_io_state *ios, void *p)
/*
* Set up a new inode and create an object for it on the OSD
*/
-struct inode *exofs_new_inode(struct inode *dir, int mode)
+struct inode *exofs_new_inode(struct inode *dir, umode_t mode)
{
- struct super_block *sb;
+ struct super_block *sb = dir->i_sb;
+ struct exofs_sb_info *sbi = sb->s_fs_info;
struct inode *inode;
struct exofs_i_info *oi;
- struct exofs_sb_info *sbi;
- struct exofs_io_state *ios;
+ struct ore_io_state *ios;
int ret;
- sb = dir->i_sb;
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -1128,9 +1314,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
set_obj_2bcreated(oi);
- sbi = sb->s_fs_info;
-
- sb->s_dirt = 1;
+ inode->i_mapping->backing_dev_info = sb->s_bdi;
inode_init_owner(inode, dir, mode);
inode->i_ino = sbi->s_nextid++;
inode->i_blkbits = EXOFS_BLKSHIFT;
@@ -1141,23 +1325,24 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
spin_unlock(&sbi->s_next_gen_lock);
insert_inode_hash(inode);
+ exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
+ exofs_oi_objno(oi));
+ exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
+
mark_inode_dirty(inode);
- ret = exofs_get_io_state(&sbi->layout, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
if (unlikely(ret)) {
- EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n");
+ EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
return ERR_PTR(ret);
}
- ios->obj.id = exofs_oi_objno(oi);
- exofs_make_credential(oi->i_cred, &ios->obj);
-
ios->done = create_done;
ios->private = inode;
- ios->cred = oi->i_cred;
- ret = exofs_sbi_create(ios);
+
+ ret = ore_create(ios);
if (ret) {
- exofs_put_io_state(ios);
+ ore_put_io_state(ios);
return ERR_PTR(ret);
}
atomic_inc(&sbi->s_curr_pending);
@@ -1176,11 +1361,11 @@ struct updatei_args {
/*
* Callback function from exofs_update_inode().
*/
-static void updatei_done(struct exofs_io_state *ios, void *p)
+static void updatei_done(struct ore_io_state *ios, void *p)
{
struct updatei_args *args = p;
- exofs_put_io_state(ios);
+ ore_put_io_state(ios);
atomic_dec(&args->sbi->s_curr_pending);
@@ -1196,7 +1381,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
struct exofs_i_info *oi = exofs_i(inode);
struct super_block *sb = inode->i_sb;
struct exofs_sb_info *sbi = sb->s_fs_info;
- struct exofs_io_state *ios;
+ struct ore_io_state *ios;
struct osd_attr attr;
struct exofs_fcb *fcb;
struct updatei_args *args;
@@ -1211,8 +1396,8 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
fcb = &args->fcb;
fcb->i_mode = cpu_to_le16(inode->i_mode);
- fcb->i_uid = cpu_to_le32(inode->i_uid);
- fcb->i_gid = cpu_to_le32(inode->i_gid);
+ fcb->i_uid = cpu_to_le32(i_uid_read(inode));
+ fcb->i_gid = cpu_to_le32(i_gid_read(inode));
fcb->i_links_count = cpu_to_le16(inode->i_nlink);
fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
@@ -1235,9 +1420,9 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
} else
memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
- ret = exofs_get_io_state(&sbi->layout, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
if (unlikely(ret)) {
- EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
+ EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
goto free_args;
}
@@ -1254,13 +1439,13 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
ios->private = args;
}
- ret = exofs_oi_write(oi, ios);
+ ret = ore_write(ios);
if (!do_sync && !ret) {
atomic_inc(&sbi->s_curr_pending);
goto out; /* deallocation in updatei_done */
}
- exofs_put_io_state(ios);
+ ore_put_io_state(ios);
free_args:
kfree(args);
out:
@@ -1271,18 +1456,19 @@ out:
int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+ /* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */
+ return exofs_update_inode(inode, 1);
}
/*
* Callback function from exofs_delete_inode() - don't have much cleaning up to
* do.
*/
-static void delete_done(struct exofs_io_state *ios, void *p)
+static void delete_done(struct ore_io_state *ios, void *p)
{
struct exofs_sb_info *sbi = p;
- exofs_put_io_state(ios);
+ ore_put_io_state(ios);
atomic_dec(&sbi->s_curr_pending);
}
@@ -1297,17 +1483,17 @@ void exofs_evict_inode(struct inode *inode)
struct exofs_i_info *oi = exofs_i(inode);
struct super_block *sb = inode->i_sb;
struct exofs_sb_info *sbi = sb->s_fs_info;
- struct exofs_io_state *ios;
+ struct ore_io_state *ios;
int ret;
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
/* TODO: should do better here */
if (inode->i_nlink || is_bad_inode(inode))
goto no_delete;
inode->i_size = 0;
- end_writeback(inode);
+ clear_inode(inode);
/* if we are deleting an obj that hasn't been created yet, wait.
* This also makes sure that create_done cannot be called with an
@@ -1317,20 +1503,19 @@ void exofs_evict_inode(struct inode *inode)
/* ignore the error, attempt a remove anyway */
/* Now Remove the OSD objects */
- ret = exofs_get_io_state(&sbi->layout, &ios);
+ ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
if (unlikely(ret)) {
- EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
+ EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
return;
}
- ios->obj.id = exofs_oi_objno(oi);
ios->done = delete_done;
ios->private = sbi;
- ios->cred = oi->i_cred;
- ret = exofs_sbi_remove(ios);
+
+ ret = ore_remove(ios);
if (ret) {
- EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__);
- exofs_put_io_state(ios);
+ EXOFS_ERR("%s: ore_remove failed\n", __func__);
+ ore_put_io_state(ios);
return;
}
atomic_inc(&sbi->s_curr_pending);
@@ -1338,5 +1523,5 @@ void exofs_evict_inode(struct inode *inode)
return;
no_delete:
- end_writeback(inode);
+ clear_inode(inode);
}
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
deleted file mode 100644
index f74a2ec027a..00000000000
--- a/fs/exofs/ios.c
+++ /dev/null
@@ -1,803 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <linux/slab.h>
-#include <scsi/scsi_device.h>
-#include <asm/div64.h>
-
-#include "exofs.h"
-
-#define EXOFS_DBGMSG2(M...) do {} while (0)
-/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */
-
-void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
-{
- osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
-}
-
-int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
- u64 offset, void *p, unsigned length)
-{
- struct osd_request *or = osd_start_request(od, GFP_KERNEL);
-/* struct osd_sense_info osi = {.key = 0};*/
- int ret;
-
- if (unlikely(!or)) {
- EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
- return -ENOMEM;
- }
- ret = osd_req_read_kern(or, obj, offset, p, length);
- if (unlikely(ret)) {
- EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
- goto out;
- }
-
- ret = osd_finalize_request(or, 0, cred, NULL);
- if (unlikely(ret)) {
- EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
- goto out;
- }
-
- ret = osd_execute_request(or);
- if (unlikely(ret))
- EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
- /* osd_req_decode_sense(or, ret); */
-
-out:
- osd_end_request(or);
- return ret;
-}
-
-int exofs_get_io_state(struct exofs_layout *layout,
- struct exofs_io_state **pios)
-{
- struct exofs_io_state *ios;
-
- /*TODO: Maybe use kmem_cach per sbi of size
- * exofs_io_state_size(layout->s_numdevs)
- */
- ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
- if (unlikely(!ios)) {
- EXOFS_DBGMSG("Failed kzalloc bytes=%d\n",
- exofs_io_state_size(layout->s_numdevs));
- *pios = NULL;
- return -ENOMEM;
- }
-
- ios->layout = layout;
- ios->obj.partition = layout->s_pid;
- *pios = ios;
- return 0;
-}
-
-void exofs_put_io_state(struct exofs_io_state *ios)
-{
- if (ios) {
- unsigned i;
-
- for (i = 0; i < ios->numdevs; i++) {
- struct exofs_per_dev_state *per_dev = &ios->per_dev[i];
-
- if (per_dev->or)
- osd_end_request(per_dev->or);
- if (per_dev->bio)
- bio_put(per_dev->bio);
- }
-
- kfree(ios);
- }
-}
-
-unsigned exofs_layout_od_id(struct exofs_layout *layout,
- osd_id obj_no, unsigned layout_index)
-{
-/* switch (layout->lay_func) {
- case LAYOUT_MOVING_WINDOW:
- {*/
- unsigned dev_mod = obj_no;
-
- return (layout_index + dev_mod * layout->mirrors_p1) %
- layout->s_numdevs;
-/* }
- case LAYOUT_FUNC_IMPLICT:
- return layout->devs[layout_index];
- }*/
-}
-
-static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios,
- unsigned layout_index)
-{
- return ios->layout->s_ods[
- exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)];
-}
-
-static void _sync_done(struct exofs_io_state *ios, void *p)
-{
- struct completion *waiting = p;
-
- complete(waiting);
-}
-
-static void _last_io(struct kref *kref)
-{
- struct exofs_io_state *ios = container_of(
- kref, struct exofs_io_state, kref);
-
- ios->done(ios, ios->private);
-}
-
-static void _done_io(struct osd_request *or, void *p)
-{
- struct exofs_io_state *ios = p;
-
- kref_put(&ios->kref, _last_io);
-}
-
-static int exofs_io_execute(struct exofs_io_state *ios)
-{
- DECLARE_COMPLETION_ONSTACK(wait);
- bool sync = (ios->done == NULL);
- int i, ret;
-
- if (sync) {
- ios->done = _sync_done;
- ios->private = &wait;
- }
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_request *or = ios->per_dev[i].or;
- if (unlikely(!or))
- continue;
-
- ret = osd_finalize_request(or, 0, ios->cred, NULL);
- if (unlikely(ret)) {
- EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n",
- ret);
- return ret;
- }
- }
-
- kref_init(&ios->kref);
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_request *or = ios->per_dev[i].or;
- if (unlikely(!or))
- continue;
-
- kref_get(&ios->kref);
- osd_execute_request_async(or, _done_io, ios);
- }
-
- kref_put(&ios->kref, _last_io);
- ret = 0;
-
- if (sync) {
- wait_for_completion(&wait);
- ret = exofs_check_io(ios, NULL);
- }
- return ret;
-}
-
-static void _clear_bio(struct bio *bio)
-{
- struct bio_vec *bv;
- unsigned i;
-
- __bio_for_each_segment(bv, bio, i, 0) {
- unsigned this_count = bv->bv_len;
-
- if (likely(PAGE_SIZE == this_count))
- clear_highpage(bv->bv_page);
- else
- zero_user(bv->bv_page, bv->bv_offset, this_count);
- }
-}
-
-int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
-{
- enum osd_err_priority acumulated_osd_err = 0;
- int acumulated_lin_err = 0;
- int i;
-
- for (i = 0; i < ios->numdevs; i++) {
- struct osd_sense_info osi;
- struct osd_request *or = ios->per_dev[i].or;
- int ret;
-
- if (unlikely(!or))
- continue;
-
- ret = osd_req_decode_sense(or, &osi);
- if (likely(!ret))
- continue;
-
- if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
- /* start read offset passed endof file */
- _clear_bio(ios->per_dev[i].bio);
- EXOFS_DBGMSG("start read offset passed end of file "
- "offset=0x%llx, length=0x%llx\n",
- _LLU(ios->per_dev[i].offset),
- _LLU(ios->per_dev[i].length));
-
- continue; /* we recovered */
- }
-
- if (osi.osd_err_pri >= acumulated_osd_err) {
- acumulated_osd_err = osi.osd_err_pri;
- acumulated_lin_err = ret;
- }
- }
-
- /* TODO: raid specific residual calculations */
- if (resid) {
- if (likely(!acumulated_lin_err))
- *resid = 0;
- else
- *resid = ios->length;
- }
-
- return acumulated_lin_err;
-}
-
-/*
- * L - logical offset into the file
- *
- * U - The number of bytes in a stripe within a group
- *
- * U = stripe_unit * group_width
- *
- * T - The number of bytes striped within a group of component objects
- * (before advancing to the next group)
- *
- * T = stripe_unit * group_width * group_depth
- *
- * S - The number of bytes striped across all component objects
- * before the pattern repeats
- *
- * S = stripe_unit * group_width * group_depth * group_count
- *
- * M - The "major" (i.e., across all components) stripe number
- *
- * M = L / S
- *
- * G - Counts the groups from the beginning of the major stripe
- *
- * G = (L - (M * S)) / T [or (L % S) / T]
- *
- * H - The byte offset within the group
- *
- * H = (L - (M * S)) % T [or (L % S) % T]
- *
- * N - The "minor" (i.e., across the group) stripe number
- *
- * N = H / U
- *
- * C - The component index coresponding to L
- *
- * C = (H - (N * U)) / stripe_unit + G * group_width
- * [or (L % U) / stripe_unit + G * group_width]
- *
- * O - The component offset coresponding to L
- *
- * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
- */
-struct _striping_info {
- u64 obj_offset;
- u64 group_length;
- unsigned dev;
- unsigned unit_off;
-};
-
-static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
- struct _striping_info *si)
-{
- u32 stripe_unit = ios->layout->stripe_unit;
- u32 group_width = ios->layout->group_width;
- u64 group_depth = ios->layout->group_depth;
-
- u32 U = stripe_unit * group_width;
- u64 T = U * group_depth;
- u64 S = T * ios->layout->group_count;
- u64 M = div64_u64(file_offset, S);
-
- /*
- G = (L - (M * S)) / T
- H = (L - (M * S)) % T
- */
- u64 LmodS = file_offset - M * S;
- u32 G = div64_u64(LmodS, T);
- u64 H = LmodS - G * T;
-
- u32 N = div_u64(H, U);
-
- /* "H - (N * U)" is just "H % U" so it's bound to u32 */
- si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
- si->dev *= ios->layout->mirrors_p1;
-
- div_u64_rem(file_offset, stripe_unit, &si->unit_off);
-
- si->obj_offset = si->unit_off + (N * stripe_unit) +
- (M * group_depth * stripe_unit);
-
- si->group_length = T - H;
-}
-
-static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
- unsigned pgbase, struct exofs_per_dev_state *per_dev,
- int cur_len)
-{
- unsigned pg = *cur_pg;
- struct request_queue *q =
- osd_request_queue(exofs_ios_od(ios, per_dev->dev));
-
- per_dev->length += cur_len;
-
- if (per_dev->bio == NULL) {
- unsigned pages_in_stripe = ios->layout->group_width *
- (ios->layout->stripe_unit / PAGE_SIZE);
- unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
- ios->layout->group_width;
-
- per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
- if (unlikely(!per_dev->bio)) {
- EXOFS_DBGMSG("Failed to allocate BIO size=%u\n",
- bio_size);
- return -ENOMEM;
- }
- }
-
- while (cur_len > 0) {
- unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
- unsigned added_len;
-
- BUG_ON(ios->nr_pages <= pg);
- cur_len -= pglen;
-
- added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
- pglen, pgbase);
- if (unlikely(pglen != added_len))
- return -ENOMEM;
- pgbase = 0;
- ++pg;
- }
- BUG_ON(cur_len);
-
- *cur_pg = pg;
- return 0;
-}
-
-static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
- struct _striping_info *si)
-{
- unsigned stripe_unit = ios->layout->stripe_unit;
- unsigned mirrors_p1 = ios->layout->mirrors_p1;
- unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
- unsigned dev = si->dev;
- unsigned first_dev = dev - (dev % devs_in_group);
- unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
- unsigned cur_pg = ios->pages_consumed;
- int ret = 0;
-
- while (length) {
- struct exofs_per_dev_state *per_dev = &ios->per_dev[dev];
- unsigned cur_len, page_off = 0;
-
- if (!per_dev->length) {
- per_dev->dev = dev;
- if (dev < si->dev) {
- per_dev->offset = si->obj_offset + stripe_unit -
- si->unit_off;
- cur_len = stripe_unit;
- } else if (dev == si->dev) {
- per_dev->offset = si->obj_offset;
- cur_len = stripe_unit - si->unit_off;
- page_off = si->unit_off & ~PAGE_MASK;
- BUG_ON(page_off && (page_off != ios->pgbase));
- } else { /* dev > si->dev */
- per_dev->offset = si->obj_offset - si->unit_off;
- cur_len = stripe_unit;
- }
-
- if (max_comp < dev)
- max_comp = dev;
- } else {
- cur_len = stripe_unit;
- }
- if (cur_len >= length)
- cur_len = length;
-
- ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
- cur_len);
- if (unlikely(ret))
- goto out;
-
- dev += mirrors_p1;
- dev = (dev % devs_in_group) + first_dev;
-
- length -= cur_len;
- }
-out:
- ios->numdevs = max_comp + mirrors_p1;
- ios->pages_consumed = cur_pg;
- return ret;
-}
-
-static int _prepare_for_striping(struct exofs_io_state *ios)
-{
- u64 length = ios->length;
- u64 offset = ios->offset;
- struct _striping_info si;
- int ret = 0;
-
- if (!ios->pages) {
- if (ios->kern_buff) {
- struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
-
- _calc_stripe_info(ios, ios->offset, &si);
- per_dev->offset = si.obj_offset;
- per_dev->dev = si.dev;
-
- /* no cross device without page array */
- BUG_ON((ios->layout->group_width > 1) &&
- (si.unit_off + ios->length >
- ios->layout->stripe_unit));
- }
- ios->numdevs = ios->layout->mirrors_p1;
- return 0;
- }
-
- while (length) {
- _calc_stripe_info(ios, offset, &si);
-
- if (length < si.group_length)
- si.group_length = length;
-
- ret = _prepare_one_group(ios, si.group_length, &si);
- if (unlikely(ret))
- goto out;
-
- offset += si.group_length;
- length -= si.group_length;
- }
-
-out:
- return ret;
-}
-
-int exofs_sbi_create(struct exofs_io_state *ios)
-{
- int i, ret;
-
- for (i = 0; i < ios->layout->s_numdevs; i++) {
- struct osd_request *or;
-
- or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
- if (unlikely(!or)) {
- EXOFS_ERR("%s: osd_start_request failed\n", __func__);
- ret = -ENOMEM;
- goto out;
- }
- ios->per_dev[i].or = or;
- ios->numdevs++;
-
- osd_req_create_object(or, &ios->obj);
- }
- ret = exofs_io_execute(ios);
-
-out:
- return ret;
-}
-
-int exofs_sbi_remove(struct exofs_io_state *ios)
-{
- int i, ret;
-
- for (i = 0; i < ios->layout->s_numdevs; i++) {
- struct osd_request *or;
-
- or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
- if (unlikely(!or)) {
- EXOFS_ERR("%s: osd_start_request failed\n", __func__);
- ret = -ENOMEM;
- goto out;
- }
- ios->per_dev[i].or = or;
- ios->numdevs++;
-
- osd_req_remove_object(or, &ios->obj);
- }
- ret = exofs_io_execute(ios);
-
-out:
- return ret;
-}
-
-static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
-{
- struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp];
- unsigned dev = ios->per_dev[cur_comp].dev;
- unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
- int ret = 0;
-
- if (ios->pages && !master_dev->length)
- return 0; /* Just an empty slot */
-
- for (; cur_comp < last_comp; ++cur_comp, ++dev) {
- struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
- struct osd_request *or;
-
- or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL);
- if (unlikely(!or)) {
- EXOFS_ERR("%s: osd_start_request failed\n", __func__);
- ret = -ENOMEM;
- goto out;
- }
- per_dev->or = or;
- per_dev->offset = master_dev->offset;
-
- if (ios->pages) {
- struct bio *bio;
-
- if (per_dev != master_dev) {
- bio = bio_kmalloc(GFP_KERNEL,
- master_dev->bio->bi_max_vecs);
- if (unlikely(!bio)) {
- EXOFS_DBGMSG(
- "Failed to allocate BIO size=%u\n",
- master_dev->bio->bi_max_vecs);
- ret = -ENOMEM;
- goto out;
- }
-
- __bio_clone(bio, master_dev->bio);
- bio->bi_bdev = NULL;
- bio->bi_next = NULL;
- per_dev->length = master_dev->length;
- per_dev->bio = bio;
- per_dev->dev = dev;
- } else {
- bio = master_dev->bio;
- /* FIXME: bio_set_dir() */
- bio->bi_rw |= REQ_WRITE;
- }
-
- osd_req_write(or, &ios->obj, per_dev->offset, bio,
- per_dev->length);
- EXOFS_DBGMSG("write(0x%llx) offset=0x%llx "
- "length=0x%llx dev=%d\n",
- _LLU(ios->obj.id), _LLU(per_dev->offset),
- _LLU(per_dev->length), dev);
- } else if (ios->kern_buff) {
- ret = osd_req_write_kern(or, &ios->obj, per_dev->offset,
- ios->kern_buff, ios->length);
- if (unlikely(ret))
- goto out;
- EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
- "length=0x%llx dev=%d\n",
- _LLU(ios->obj.id), _LLU(per_dev->offset),
- _LLU(ios->length), dev);
- } else {
- osd_req_set_attributes(or, &ios->obj);
- EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
- _LLU(ios->obj.id), ios->out_attr_len, dev);
- }
-
- if (ios->out_attr)
- osd_req_add_set_attr_list(or, ios->out_attr,
- ios->out_attr_len);
-
- if (ios->in_attr)
- osd_req_add_get_attr_list(or, ios->in_attr,
- ios->in_attr_len);
- }
-
-out:
- return ret;
-}
-
-int exofs_sbi_write(struct exofs_io_state *ios)
-{
- int i;
- int ret;
-
- ret = _prepare_for_striping(ios);
- if (unlikely(ret))
- return ret;
-
- for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
- ret = _sbi_write_mirror(ios, i);
- if (unlikely(ret))
- return ret;
- }
-
- ret = exofs_io_execute(ios);
- return ret;
-}
-
-static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp)
-{
- struct osd_request *or;
- struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
- unsigned first_dev = (unsigned)ios->obj.id;
-
- if (ios->pages && !per_dev->length)
- return 0; /* Just an empty slot */
-
- first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
- or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL);
- if (unlikely(!or)) {
- EXOFS_ERR("%s: osd_start_request failed\n", __func__);
- return -ENOMEM;
- }
- per_dev->or = or;
-
- if (ios->pages) {
- osd_req_read(or, &ios->obj, per_dev->offset,
- per_dev->bio, per_dev->length);
- EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
- " dev=%d\n", _LLU(ios->obj.id),
- _LLU(per_dev->offset), _LLU(per_dev->length),
- first_dev);
- } else if (ios->kern_buff) {
- int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset,
- ios->kern_buff, ios->length);
- EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
- "length=0x%llx dev=%d ret=>%d\n",
- _LLU(ios->obj.id), _LLU(per_dev->offset),
- _LLU(ios->length), first_dev, ret);
- if (unlikely(ret))
- return ret;
- } else {
- osd_req_get_attributes(or, &ios->obj);
- EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
- _LLU(ios->obj.id), ios->in_attr_len, first_dev);
- }
- if (ios->out_attr)
- osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
-
- if (ios->in_attr)
- osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
-
- return 0;
-}
-
-int exofs_sbi_read(struct exofs_io_state *ios)
-{
- int i;
- int ret;
-
- ret = _prepare_for_striping(ios);
- if (unlikely(ret))
- return ret;
-
- for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
- ret = _sbi_read_mirror(ios, i);
- if (unlikely(ret))
- return ret;
- }
-
- ret = exofs_io_execute(ios);
- return ret;
-}
-
-int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
-{
- struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
- void *iter = NULL;
- int nelem;
-
- do {
- nelem = 1;
- osd_req_decode_get_attr_list(ios->per_dev[0].or,
- &cur_attr, &nelem, &iter);
- if ((cur_attr.attr_page == attr->attr_page) &&
- (cur_attr.attr_id == attr->attr_id)) {
- attr->len = cur_attr.len;
- attr->val_ptr = cur_attr.val_ptr;
- return 0;
- }
- } while (iter);
-
- return -EIO;
-}
-
-static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp,
- struct osd_attr *attr)
-{
- int last_comp = cur_comp + ios->layout->mirrors_p1;
-
- for (; cur_comp < last_comp; ++cur_comp) {
- struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
- struct osd_request *or;
-
- or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL);
- if (unlikely(!or)) {
- EXOFS_ERR("%s: osd_start_request failed\n", __func__);
- return -ENOMEM;
- }
- per_dev->or = or;
-
- osd_req_set_attributes(or, &ios->obj);
- osd_req_add_set_attr_list(or, attr, 1);
- }
-
- return 0;
-}
-
-int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
-{
- struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
- struct exofs_io_state *ios;
- struct exofs_trunc_attr {
- struct osd_attr attr;
- __be64 newsize;
- } *size_attrs;
- struct _striping_info si;
- int i, ret;
-
- ret = exofs_get_io_state(&sbi->layout, &ios);
- if (unlikely(ret))
- return ret;
-
- size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs),
- GFP_KERNEL);
- if (unlikely(!size_attrs)) {
- ret = -ENOMEM;
- goto out;
- }
-
- ios->obj.id = exofs_oi_objno(oi);
- ios->cred = oi->i_cred;
-
- ios->numdevs = ios->layout->s_numdevs;
- _calc_stripe_info(ios, size, &si);
-
- for (i = 0; i < ios->layout->group_width; ++i) {
- struct exofs_trunc_attr *size_attr = &size_attrs[i];
- u64 obj_size;
-
- if (i < si.dev)
- obj_size = si.obj_offset +
- ios->layout->stripe_unit - si.unit_off;
- else if (i == si.dev)
- obj_size = si.obj_offset;
- else /* i > si.dev */
- obj_size = si.obj_offset - si.unit_off;
-
- size_attr->newsize = cpu_to_be64(obj_size);
- size_attr->attr = g_attr_logical_length;
- size_attr->attr.val_ptr = &size_attr->newsize;
-
- ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
- &size_attr->attr);
- if (unlikely(ret))
- goto out;
- }
- ret = exofs_io_execute(ios);
-
-out:
- kfree(size_attrs);
- exofs_put_io_state(ios);
- return ret;
-}
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 4d70db110cf..4731fd991ef 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -46,7 +46,7 @@ static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
}
static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
struct inode *inode;
ino_t ino;
@@ -55,17 +55,12 @@ static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
return ERR_PTR(-ENAMETOOLONG);
ino = exofs_inode_by_name(dir, dentry);
- inode = NULL;
- if (ino) {
- inode = exofs_iget(dir->i_sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
- }
+ inode = ino ? exofs_iget(dir->i_sb, ino) : NULL;
return d_splice_alias(inode, dentry);
}
-static int exofs_create(struct inode *dir, struct dentry *dentry, int mode,
- struct nameidata *nd)
+static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
{
struct inode *inode = exofs_new_inode(dir, mode);
int err = PTR_ERR(inode);
@@ -79,7 +74,7 @@ static int exofs_create(struct inode *dir, struct dentry *dentry, int mode,
return err;
}
-static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode,
+static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
struct inode *inode;
@@ -148,9 +143,6 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
{
struct inode *inode = old_dentry->d_inode;
- if (inode->i_nlink >= EXOFS_LINK_MAX)
- return -EMLINK;
-
inode->i_ctime = CURRENT_TIME;
inode_inc_link_count(inode);
ihold(inode);
@@ -158,13 +150,10 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
return exofs_add_nondir(dentry, inode);
}
-static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct inode *inode;
- int err = -EMLINK;
-
- if (dir->i_nlink >= EXOFS_LINK_MAX)
- goto out;
+ int err;
inode_inc_link_count(dir);
@@ -280,11 +269,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (err)
goto out_dir;
} else {
- if (dir_de) {
- err = -EMLINK;
- if (new_dir->i_nlink >= EXOFS_LINK_MAX)
- goto out_dir;
- }
err = exofs_add_link(new_dentry, old_inode);
if (err)
goto out_dir;
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
new file mode 100644
index 00000000000..cfc0205d62c
--- /dev/null
+++ b/fs/exofs/ore.c
@@ -0,0 +1,1164 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com)
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation. Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <asm/div64.h>
+#include <linux/lcm.h>
+
+#include "ore_raid.h"
+
+MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
+MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
+MODULE_LICENSE("GPL");
+
+/* ore_verify_layout does a couple of things:
+ * 1. Given a minimum number of needed parameters fixes up the rest of the
+ * members to be operatonals for the ore. The needed parameters are those
+ * that are defined by the pnfs-objects layout STD.
+ * 2. Check to see if the current ore code actually supports these parameters
+ * for example stripe_unit must be a multple of the system PAGE_SIZE,
+ * and etc...
+ * 3. Cache some havily used calculations that will be needed by users.
+ */
+
+enum { BIO_MAX_PAGES_KMALLOC =
+ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),};
+
+int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
+{
+ u64 stripe_length;
+
+ switch (layout->raid_algorithm) {
+ case PNFS_OSD_RAID_0:
+ layout->parity = 0;
+ break;
+ case PNFS_OSD_RAID_5:
+ layout->parity = 1;
+ break;
+ case PNFS_OSD_RAID_PQ:
+ layout->parity = 2;
+ break;
+ case PNFS_OSD_RAID_4:
+ default:
+ ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n",
+ layout->raid_algorithm);
+ return -EINVAL;
+ }
+ if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
+ ORE_ERR("Stripe Unit(0x%llx)"
+ " must be Multples of PAGE_SIZE(0x%lx)\n",
+ _LLU(layout->stripe_unit), PAGE_SIZE);
+ return -EINVAL;
+ }
+ if (layout->group_width) {
+ if (!layout->group_depth) {
+ ORE_ERR("group_depth == 0 && group_width != 0\n");
+ return -EINVAL;
+ }
+ if (total_comps < (layout->group_width * layout->mirrors_p1)) {
+ ORE_ERR("Data Map wrong, "
+ "numdevs=%d < group_width=%d * mirrors=%d\n",
+ total_comps, layout->group_width,
+ layout->mirrors_p1);
+ return -EINVAL;
+ }
+ layout->group_count = total_comps / layout->mirrors_p1 /
+ layout->group_width;
+ } else {
+ if (layout->group_depth) {
+ printk(KERN_NOTICE "Warning: group_depth ignored "
+ "group_width == 0 && group_depth == %lld\n",
+ _LLU(layout->group_depth));
+ }
+ layout->group_width = total_comps / layout->mirrors_p1;
+ layout->group_depth = -1;
+ layout->group_count = 1;
+ }
+
+ stripe_length = (u64)layout->group_width * layout->stripe_unit;
+ if (stripe_length >= (1ULL << 32)) {
+ ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n",
+ _LLU(stripe_length));
+ return -EINVAL;
+ }
+
+ layout->max_io_length =
+ (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
+ (layout->group_width - layout->parity);
+ if (layout->parity) {
+ unsigned stripe_length =
+ (layout->group_width - layout->parity) *
+ layout->stripe_unit;
+
+ layout->max_io_length /= stripe_length;
+ layout->max_io_length *= stripe_length;
+ }
+ ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length);
+
+ return 0;
+}
+EXPORT_SYMBOL(ore_verify_layout);
+
+static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
+{
+ return ios->oc->comps[index & ios->oc->single_comp].cred;
+}
+
+static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
+{
+ return &ios->oc->comps[index & ios->oc->single_comp].obj;
+}
+
+static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
+{
+ ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n",
+ ios->oc->first_dev, ios->oc->numdevs, index,
+ ios->oc->ods);
+
+ return ore_comp_dev(ios->oc, index);
+}
+
+int _ore_get_io_state(struct ore_layout *layout,
+ struct ore_components *oc, unsigned numdevs,
+ unsigned sgs_per_dev, unsigned num_par_pages,
+ struct ore_io_state **pios)
+{
+ struct ore_io_state *ios;
+ struct page **pages;
+ struct osd_sg_entry *sgilist;
+ struct __alloc_all_io_state {
+ struct ore_io_state ios;
+ struct ore_per_dev_state per_dev[numdevs];
+ union {
+ struct osd_sg_entry sglist[sgs_per_dev * numdevs];
+ struct page *pages[num_par_pages];
+ };
+ } *_aios;
+
+ if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
+ _aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
+ if (unlikely(!_aios)) {
+ ORE_DBGMSG("Failed kzalloc bytes=%zd\n",
+ sizeof(*_aios));
+ *pios = NULL;
+ return -ENOMEM;
+ }
+ pages = num_par_pages ? _aios->pages : NULL;
+ sgilist = sgs_per_dev ? _aios->sglist : NULL;
+ ios = &_aios->ios;
+ } else {
+ struct __alloc_small_io_state {
+ struct ore_io_state ios;
+ struct ore_per_dev_state per_dev[numdevs];
+ } *_aio_small;
+ union __extra_part {
+ struct osd_sg_entry sglist[sgs_per_dev * numdevs];
+ struct page *pages[num_par_pages];
+ } *extra_part;
+
+ _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
+ if (unlikely(!_aio_small)) {
+ ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
+ sizeof(*_aio_small));
+ *pios = NULL;
+ return -ENOMEM;
+ }
+ extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
+ if (unlikely(!extra_part)) {
+ ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
+ sizeof(*extra_part));
+ kfree(_aio_small);
+ *pios = NULL;
+ return -ENOMEM;
+ }
+
+ pages = num_par_pages ? extra_part->pages : NULL;
+ sgilist = sgs_per_dev ? extra_part->sglist : NULL;
+ /* In this case the per_dev[0].sgilist holds the pointer to
+ * be freed
+ */
+ ios = &_aio_small->ios;
+ ios->extra_part_alloc = true;
+ }
+
+ if (pages) {
+ ios->parity_pages = pages;
+ ios->max_par_pages = num_par_pages;
+ }
+ if (sgilist) {
+ unsigned d;
+
+ for (d = 0; d < numdevs; ++d) {
+ ios->per_dev[d].sglist = sgilist;
+ sgilist += sgs_per_dev;
+ }
+ ios->sgs_per_dev = sgs_per_dev;
+ }
+
+ ios->layout = layout;
+ ios->oc = oc;
+ *pios = ios;
+ return 0;
+}
+
+/* Allocate an io_state for only a single group of devices
+ *
+ * If a user needs to call ore_read/write() this version must be used becase it
+ * allocates extra stuff for striping and raid.
+ * The ore might decide to only IO less then @length bytes do to alignmets
+ * and constrains as follows:
+ * - The IO cannot cross group boundary.
+ * - In raid5/6 The end of the IO must align at end of a stripe eg.
+ * (@offset + @length) % strip_size == 0. Or the complete range is within a
+ * single stripe.
+ * - Memory condition only permitted a shorter IO. (A user can use @length=~0
+ * And check the returned ios->length for max_io_size.)
+ *
+ * The caller must check returned ios->length (and/or ios->nr_pages) and
+ * re-issue these pages that fall outside of ios->length
+ */
+int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
+ bool is_reading, u64 offset, u64 length,
+ struct ore_io_state **pios)
+{
+ struct ore_io_state *ios;
+ unsigned numdevs = layout->group_width * layout->mirrors_p1;
+ unsigned sgs_per_dev = 0, max_par_pages = 0;
+ int ret;
+
+ if (layout->parity && length) {
+ unsigned data_devs = layout->group_width - layout->parity;
+ unsigned stripe_size = layout->stripe_unit * data_devs;
+ unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
+ u32 remainder;
+ u64 num_stripes;
+ u64 num_raid_units;
+
+ num_stripes = div_u64_rem(length, stripe_size, &remainder);
+ if (remainder)
+ ++num_stripes;
+
+ num_raid_units = num_stripes * layout->parity;
+
+ if (is_reading) {
+ /* For reads add per_dev sglist array */
+ /* TODO: Raid 6 we need twice more. Actually:
+ * num_stripes / LCMdP(W,P);
+ * if (W%P != 0) num_stripes *= parity;
+ */
+
+ /* first/last seg is split */
+ num_raid_units += layout->group_width;
+ sgs_per_dev = div_u64(num_raid_units, data_devs) + 2;
+ } else {
+ /* For Writes add parity pages array. */
+ max_par_pages = num_raid_units * pages_in_unit *
+ sizeof(struct page *);
+ }
+ }
+
+ ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages,
+ pios);
+ if (unlikely(ret))
+ return ret;
+
+ ios = *pios;
+ ios->reading = is_reading;
+ ios->offset = offset;
+
+ if (length) {
+ ore_calc_stripe_info(layout, offset, length, &ios->si);
+ ios->length = ios->si.length;
+ ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) +
+ ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
+ if (layout->parity)
+ _ore_post_alloc_raid_stuff(ios);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ore_get_rw_state);
+
+/* Allocate an io_state for all the devices in the comps array
+ *
+ * This version of io_state allocation is used mostly by create/remove
+ * and trunc where we currently need all the devices. The only wastful
+ * bit is the read/write_attributes with no IO. Those sites should
+ * be converted to use ore_get_rw_state() with length=0
+ */
+int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
+ struct ore_io_state **pios)
+{
+ return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios);
+}
+EXPORT_SYMBOL(ore_get_io_state);
+
+void ore_put_io_state(struct ore_io_state *ios)
+{
+ if (ios) {
+ unsigned i;
+
+ for (i = 0; i < ios->numdevs; i++) {
+ struct ore_per_dev_state *per_dev = &ios->per_dev[i];
+
+ if (per_dev->or)
+ osd_end_request(per_dev->or);
+ if (per_dev->bio)
+ bio_put(per_dev->bio);
+ }
+
+ _ore_free_raid_stuff(ios);
+ kfree(ios);
+ }
+}
+EXPORT_SYMBOL(ore_put_io_state);
+
+static void _sync_done(struct ore_io_state *ios, void *p)
+{
+ struct completion *waiting = p;
+
+ complete(waiting);
+}
+
+static void _last_io(struct kref *kref)
+{
+ struct ore_io_state *ios = container_of(
+ kref, struct ore_io_state, kref);
+
+ ios->done(ios, ios->private);
+}
+
+static void _done_io(struct osd_request *or, void *p)
+{
+ struct ore_io_state *ios = p;
+
+ kref_put(&ios->kref, _last_io);
+}
+
+int ore_io_execute(struct ore_io_state *ios)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+ bool sync = (ios->done == NULL);
+ int i, ret;
+
+ if (sync) {
+ ios->done = _sync_done;
+ ios->private = &wait;
+ }
+
+ for (i = 0; i < ios->numdevs; i++) {
+ struct osd_request *or = ios->per_dev[i].or;
+ if (unlikely(!or))
+ continue;
+
+ ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL);
+ if (unlikely(ret)) {
+ ORE_DBGMSG("Failed to osd_finalize_request() => %d\n",
+ ret);
+ return ret;
+ }
+ }
+
+ kref_init(&ios->kref);
+
+ for (i = 0; i < ios->numdevs; i++) {
+ struct osd_request *or = ios->per_dev[i].or;
+ if (unlikely(!or))
+ continue;
+
+ kref_get(&ios->kref);
+ osd_execute_request_async(or, _done_io, ios);
+ }
+
+ kref_put(&ios->kref, _last_io);
+ ret = 0;
+
+ if (sync) {
+ wait_for_completion(&wait);
+ ret = ore_check_io(ios, NULL);
+ }
+ return ret;
+}
+
+static void _clear_bio(struct bio *bio)
+{
+ struct bio_vec *bv;
+ unsigned i;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ unsigned this_count = bv->bv_len;
+
+ if (likely(PAGE_SIZE == this_count))
+ clear_highpage(bv->bv_page);
+ else
+ zero_user(bv->bv_page, bv->bv_offset, this_count);
+ }
+}
+
+int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
+{
+ enum osd_err_priority acumulated_osd_err = 0;
+ int acumulated_lin_err = 0;
+ int i;
+
+ for (i = 0; i < ios->numdevs; i++) {
+ struct osd_sense_info osi;
+ struct ore_per_dev_state *per_dev = &ios->per_dev[i];
+ struct osd_request *or = per_dev->or;
+ int ret;
+
+ if (unlikely(!or))
+ continue;
+
+ ret = osd_req_decode_sense(or, &osi);
+ if (likely(!ret))
+ continue;
+
+ if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) &&
+ per_dev->bio) {
+ /* start read offset passed endof file.
+ * Note: if we do not have bio it means read-attributes
+ * In this case we should return error to caller.
+ */
+ _clear_bio(per_dev->bio);
+ ORE_DBGMSG("start read offset passed end of file "
+ "offset=0x%llx, length=0x%llx\n",
+ _LLU(per_dev->offset),
+ _LLU(per_dev->length));
+
+ continue; /* we recovered */
+ }
+
+ if (on_dev_error) {
+ u64 residual = ios->reading ?
+ or->in.residual : or->out.residual;
+ u64 offset = (ios->offset + ios->length) - residual;
+ unsigned dev = per_dev->dev - ios->oc->first_dev;
+ struct ore_dev *od = ios->oc->ods[dev];
+
+ on_dev_error(ios, od, dev, osi.osd_err_pri,
+ offset, residual);
+ }
+ if (osi.osd_err_pri >= acumulated_osd_err) {
+ acumulated_osd_err = osi.osd_err_pri;
+ acumulated_lin_err = ret;
+ }
+ }
+
+ return acumulated_lin_err;
+}
+EXPORT_SYMBOL(ore_check_io);
+
+/*
+ * L - logical offset into the file
+ *
+ * D - number of Data devices
+ * D = group_width - parity
+ *
+ * U - The number of bytes in a stripe within a group
+ * U = stripe_unit * D
+ *
+ * T - The number of bytes striped within a group of component objects
+ * (before advancing to the next group)
+ * T = U * group_depth
+ *
+ * S - The number of bytes striped across all component objects
+ * before the pattern repeats
+ * S = T * group_count
+ *
+ * M - The "major" (i.e., across all components) cycle number
+ * M = L / S
+ *
+ * G - Counts the groups from the beginning of the major cycle
+ * G = (L - (M * S)) / T [or (L % S) / T]
+ *
+ * H - The byte offset within the group
+ * H = (L - (M * S)) % T [or (L % S) % T]
+ *
+ * N - The "minor" (i.e., across the group) stripe number
+ * N = H / U
+ *
+ * C - The component index coresponding to L
+ *
+ * C = (H - (N * U)) / stripe_unit + G * D
+ * [or (L % U) / stripe_unit + G * D]
+ *
+ * O - The component offset coresponding to L
+ * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
+ *
+ * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity
+ * divide by parity
+ * LCMdP = lcm(group_width, parity) / parity
+ *
+ * R - The parity Rotation stripe
+ * (Note parity cycle always starts at a group's boundary)
+ * R = N % LCMdP
+ *
+ * I = the first parity device index
+ * I = (group_width + group_width - R*parity - parity) % group_width
+ *
+ * Craid - The component index Rotated
+ * Craid = (group_width + C - R*parity) % group_width
+ * (We add the group_width to avoid negative numbers modulo math)
+ */
+void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
+ u64 length, struct ore_striping_info *si)
+{
+ u32 stripe_unit = layout->stripe_unit;
+ u32 group_width = layout->group_width;
+ u64 group_depth = layout->group_depth;
+ u32 parity = layout->parity;
+
+ u32 D = group_width - parity;
+ u32 U = D * stripe_unit;
+ u64 T = U * group_depth;
+ u64 S = T * layout->group_count;
+ u64 M = div64_u64(file_offset, S);
+
+ /*
+ G = (L - (M * S)) / T
+ H = (L - (M * S)) % T
+ */
+ u64 LmodS = file_offset - M * S;
+ u32 G = div64_u64(LmodS, T);
+ u64 H = LmodS - G * T;
+
+ u32 N = div_u64(H, U);
+ u32 Nlast;
+
+ /* "H - (N * U)" is just "H % U" so it's bound to u32 */
+ u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
+ u32 first_dev = C - C % group_width;
+
+ div_u64_rem(file_offset, stripe_unit, &si->unit_off);
+
+ si->obj_offset = si->unit_off + (N * stripe_unit) +
+ (M * group_depth * stripe_unit);
+ si->cur_comp = C - first_dev;
+ si->cur_pg = si->unit_off / PAGE_SIZE;
+
+ if (parity) {
+ u32 LCMdP = lcm(group_width, parity) / parity;
+ /* R = N % LCMdP; */
+ u32 RxP = (N % LCMdP) * parity;
+
+ si->par_dev = (group_width + group_width - parity - RxP) %
+ group_width + first_dev;
+ si->dev = (group_width + group_width + C - RxP) %
+ group_width + first_dev;
+ si->bytes_in_stripe = U;
+ si->first_stripe_start = M * S + G * T + N * U;
+ } else {
+ /* Make the math correct see _prepare_one_group */
+ si->par_dev = group_width;
+ si->dev = C;
+ }
+
+ si->dev *= layout->mirrors_p1;
+ si->par_dev *= layout->mirrors_p1;
+ si->offset = file_offset;
+ si->length = T - H;
+ if (si->length > length)
+ si->length = length;
+
+ Nlast = div_u64(H + si->length + U - 1, U);
+ si->maxdevUnits = Nlast - N;
+
+ si->M = M;
+}
+EXPORT_SYMBOL(ore_calc_stripe_info);
+
+int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
+ unsigned pgbase, struct page **pages,
+ struct ore_per_dev_state *per_dev, int cur_len)
+{
+ unsigned pg = *cur_pg;
+ struct request_queue *q =
+ osd_request_queue(_ios_od(ios, per_dev->dev));
+ unsigned len = cur_len;
+ int ret;
+
+ if (per_dev->bio == NULL) {
+ unsigned bio_size;
+
+ if (!ios->reading) {
+ bio_size = ios->si.maxdevUnits;
+ } else {
+ bio_size = (ios->si.maxdevUnits + 1) *
+ (ios->layout->group_width - ios->layout->parity) /
+ ios->layout->group_width;
+ }
+ bio_size *= (ios->layout->stripe_unit / PAGE_SIZE);
+
+ per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
+ if (unlikely(!per_dev->bio)) {
+ ORE_DBGMSG("Failed to allocate BIO size=%u\n",
+ bio_size);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ while (cur_len > 0) {
+ unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
+ unsigned added_len;
+
+ cur_len -= pglen;
+
+ added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
+ pglen, pgbase);
+ if (unlikely(pglen != added_len)) {
+ /* If bi_vcnt == bi_max then this is a SW BUG */
+ ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x "
+ "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n",
+ per_dev->bio->bi_vcnt,
+ per_dev->bio->bi_max_vecs,
+ BIO_MAX_PAGES_KMALLOC, cur_len);
+ ret = -ENOMEM;
+ goto out;
+ }
+ _add_stripe_page(ios->sp2d, &ios->si, pages[pg]);
+
+ pgbase = 0;
+ ++pg;
+ }
+ BUG_ON(cur_len);
+
+ per_dev->length += len;
+ *cur_pg = pg;
+ ret = 0;
+out: /* we fail the complete unit on an error eg don't advance
+ * per_dev->length and cur_pg. This means that we might have a bigger
+ * bio than the CDB requested length (per_dev->length). That's fine
+ * only the oposite is fatal.
+ */
+ return ret;
+}
+
+static int _add_parity_units(struct ore_io_state *ios,
+ struct ore_striping_info *si,
+ unsigned dev, unsigned first_dev,
+ unsigned mirrors_p1, unsigned devs_in_group,
+ unsigned cur_len)
+{
+ unsigned do_parity;
+ int ret = 0;
+
+ for (do_parity = ios->layout->parity; do_parity; --do_parity) {
+ struct ore_per_dev_state *per_dev;
+
+ per_dev = &ios->per_dev[dev - first_dev];
+ if (!per_dev->length && !per_dev->offset) {
+ /* Only/always the parity unit of the first
+ * stripe will be empty. So this is a chance to
+ * initialize the per_dev info.
+ */
+ per_dev->dev = dev;
+ per_dev->offset = si->obj_offset - si->unit_off;
+ }
+
+ ret = _ore_add_parity_unit(ios, si, per_dev, cur_len,
+ do_parity == 1);
+ if (unlikely(ret))
+ break;
+
+ if (do_parity != 1) {
+ dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
+ si->cur_comp = (si->cur_comp + 1) %
+ ios->layout->group_width;
+ }
+ }
+
+ return ret;
+}
+
+static int _prepare_for_striping(struct ore_io_state *ios)
+{
+ struct ore_striping_info *si = &ios->si;
+ unsigned stripe_unit = ios->layout->stripe_unit;
+ unsigned mirrors_p1 = ios->layout->mirrors_p1;
+ unsigned group_width = ios->layout->group_width;
+ unsigned devs_in_group = group_width * mirrors_p1;
+ unsigned dev = si->dev;
+ unsigned first_dev = dev - (dev % devs_in_group);
+ unsigned cur_pg = ios->pages_consumed;
+ u64 length = ios->length;
+ int ret = 0;
+
+ if (!ios->pages) {
+ ios->numdevs = ios->layout->mirrors_p1;
+ return 0;
+ }
+
+ BUG_ON(length > si->length);
+
+ while (length) {
+ struct ore_per_dev_state *per_dev =
+ &ios->per_dev[dev - first_dev];
+ unsigned cur_len, page_off = 0;
+
+ if (!per_dev->length && !per_dev->offset) {
+ /* First time initialize the per_dev info. */
+ per_dev->dev = dev;
+ if (dev == si->dev) {
+ WARN_ON(dev == si->par_dev);
+ per_dev->offset = si->obj_offset;
+ cur_len = stripe_unit - si->unit_off;
+ page_off = si->unit_off & ~PAGE_MASK;
+ BUG_ON(page_off && (page_off != ios->pgbase));
+ } else {
+ per_dev->offset = si->obj_offset - si->unit_off;
+ cur_len = stripe_unit;
+ }
+ } else {
+ cur_len = stripe_unit;
+ }
+ if (cur_len >= length)
+ cur_len = length;
+
+ ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages,
+ per_dev, cur_len);
+ if (unlikely(ret))
+ goto out;
+
+ length -= cur_len;
+
+ dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
+ si->cur_comp = (si->cur_comp + 1) % group_width;
+ if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
+ if (!length && ios->sp2d) {
+ /* If we are writing and this is the very last
+ * stripe. then operate on parity dev.
+ */
+ dev = si->par_dev;
+ /* If last stripe operate on parity comp */
+ si->cur_comp = group_width - ios->layout->parity;
+ }
+
+ /* In writes cur_len just means if it's the
+ * last one. See _ore_add_parity_unit.
+ */
+ ret = _add_parity_units(ios, si, dev, first_dev,
+ mirrors_p1, devs_in_group,
+ ios->sp2d ? length : cur_len);
+ if (unlikely(ret))
+ goto out;
+
+ /* Rotate next par_dev backwards with wraping */
+ si->par_dev = (devs_in_group + si->par_dev -
+ ios->layout->parity * mirrors_p1) %
+ devs_in_group + first_dev;
+ /* Next stripe, start fresh */
+ si->cur_comp = 0;
+ si->cur_pg = 0;
+ si->obj_offset += cur_len;
+ si->unit_off = 0;
+ }
+ }
+out:
+ ios->numdevs = devs_in_group;
+ ios->pages_consumed = cur_pg;
+ return ret;
+}
+
+int ore_create(struct ore_io_state *ios)
+{
+ int i, ret;
+
+ for (i = 0; i < ios->oc->numdevs; i++) {
+ struct osd_request *or;
+
+ or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
+ if (unlikely(!or)) {
+ ORE_ERR("%s: osd_start_request failed\n", __func__);
+ ret = -ENOMEM;
+ goto out;
+ }
+ ios->per_dev[i].or = or;
+ ios->numdevs++;
+
+ osd_req_create_object(or, _ios_obj(ios, i));
+ }
+ ret = ore_io_execute(ios);
+
+out:
+ return ret;
+}
+EXPORT_SYMBOL(ore_create);
+
+int ore_remove(struct ore_io_state *ios)
+{
+ int i, ret;
+
+ for (i = 0; i < ios->oc->numdevs; i++) {
+ struct osd_request *or;
+
+ or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
+ if (unlikely(!or)) {
+ ORE_ERR("%s: osd_start_request failed\n", __func__);
+ ret = -ENOMEM;
+ goto out;
+ }
+ ios->per_dev[i].or = or;
+ ios->numdevs++;
+
+ osd_req_remove_object(or, _ios_obj(ios, i));
+ }
+ ret = ore_io_execute(ios);
+
+out:
+ return ret;
+}
+EXPORT_SYMBOL(ore_remove);
+
+static int _write_mirror(struct ore_io_state *ios, int cur_comp)
+{
+ struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp];
+ unsigned dev = ios->per_dev[cur_comp].dev;
+ unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
+ int ret = 0;
+
+ if (ios->pages && !master_dev->length)
+ return 0; /* Just an empty slot */
+
+ for (; cur_comp < last_comp; ++cur_comp, ++dev) {
+ struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
+ struct osd_request *or;
+
+ or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL);
+ if (unlikely(!or)) {
+ ORE_ERR("%s: osd_start_request failed\n", __func__);
+ ret = -ENOMEM;
+ goto out;
+ }
+ per_dev->or = or;
+
+ if (ios->pages) {
+ struct bio *bio;
+
+ if (per_dev != master_dev) {
+ bio = bio_clone_kmalloc(master_dev->bio,
+ GFP_KERNEL);
+ if (unlikely(!bio)) {
+ ORE_DBGMSG(
+ "Failed to allocate BIO size=%u\n",
+ master_dev->bio->bi_max_vecs);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ bio->bi_bdev = NULL;
+ bio->bi_next = NULL;
+ per_dev->offset = master_dev->offset;
+ per_dev->length = master_dev->length;
+ per_dev->bio = bio;
+ per_dev->dev = dev;
+ } else {
+ bio = master_dev->bio;
+ /* FIXME: bio_set_dir() */
+ bio->bi_rw |= REQ_WRITE;
+ }
+
+ osd_req_write(or, _ios_obj(ios, cur_comp),
+ per_dev->offset, bio, per_dev->length);
+ ORE_DBGMSG("write(0x%llx) offset=0x%llx "
+ "length=0x%llx dev=%d\n",
+ _LLU(_ios_obj(ios, cur_comp)->id),
+ _LLU(per_dev->offset),
+ _LLU(per_dev->length), dev);
+ } else if (ios->kern_buff) {
+ per_dev->offset = ios->si.obj_offset;
+ per_dev->dev = ios->si.dev + dev;
+
+ /* no cross device without page array */
+ BUG_ON((ios->layout->group_width > 1) &&
+ (ios->si.unit_off + ios->length >
+ ios->layout->stripe_unit));
+
+ ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp),
+ per_dev->offset,
+ ios->kern_buff, ios->length);
+ if (unlikely(ret))
+ goto out;
+ ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
+ "length=0x%llx dev=%d\n",
+ _LLU(_ios_obj(ios, cur_comp)->id),
+ _LLU(per_dev->offset),
+ _LLU(ios->length), per_dev->dev);
+ } else {
+ osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
+ ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
+ _LLU(_ios_obj(ios, cur_comp)->id),
+ ios->out_attr_len, dev);
+ }
+
+ if (ios->out_attr)
+ osd_req_add_set_attr_list(or, ios->out_attr,
+ ios->out_attr_len);
+
+ if (ios->in_attr)
+ osd_req_add_get_attr_list(or, ios->in_attr,
+ ios->in_attr_len);
+ }
+
+out:
+ return ret;
+}
+
+int ore_write(struct ore_io_state *ios)
+{
+ int i;
+ int ret;
+
+ if (unlikely(ios->sp2d && !ios->r4w)) {
+ /* A library is attempting a RAID-write without providing
+ * a pages lock interface.
+ */
+ WARN_ON_ONCE(1);
+ return -ENOTSUPP;
+ }
+
+ ret = _prepare_for_striping(ios);
+ if (unlikely(ret))
+ return ret;
+
+ for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+ ret = _write_mirror(ios, i);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ ret = ore_io_execute(ios);
+ return ret;
+}
+EXPORT_SYMBOL(ore_write);
+
+int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp)
+{
+ struct osd_request *or;
+ struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
+ struct osd_obj_id *obj = _ios_obj(ios, cur_comp);
+ unsigned first_dev = (unsigned)obj->id;
+
+ if (ios->pages && !per_dev->length)
+ return 0; /* Just an empty slot */
+
+ first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
+ or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL);
+ if (unlikely(!or)) {
+ ORE_ERR("%s: osd_start_request failed\n", __func__);
+ return -ENOMEM;
+ }
+ per_dev->or = or;
+
+ if (ios->pages) {
+ if (per_dev->cur_sg) {
+ /* finalize the last sg_entry */
+ _ore_add_sg_seg(per_dev, 0, false);
+ if (unlikely(!per_dev->cur_sg))
+ return 0; /* Skip parity only device */
+
+ osd_req_read_sg(or, obj, per_dev->bio,
+ per_dev->sglist, per_dev->cur_sg);
+ } else {
+ /* The no raid case */
+ osd_req_read(or, obj, per_dev->offset,
+ per_dev->bio, per_dev->length);
+ }
+
+ ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
+ " dev=%d sg_len=%d\n", _LLU(obj->id),
+ _LLU(per_dev->offset), _LLU(per_dev->length),
+ first_dev, per_dev->cur_sg);
+ } else {
+ BUG_ON(ios->kern_buff);
+
+ osd_req_get_attributes(or, obj);
+ ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
+ _LLU(obj->id),
+ ios->in_attr_len, first_dev);
+ }
+ if (ios->out_attr)
+ osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
+
+ if (ios->in_attr)
+ osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
+
+ return 0;
+}
+
+int ore_read(struct ore_io_state *ios)
+{
+ int i;
+ int ret;
+
+ ret = _prepare_for_striping(ios);
+ if (unlikely(ret))
+ return ret;
+
+ for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+ ret = _ore_read_mirror(ios, i);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ ret = ore_io_execute(ios);
+ return ret;
+}
+EXPORT_SYMBOL(ore_read);
+
+int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr)
+{
+ struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
+ void *iter = NULL;
+ int nelem;
+
+ do {
+ nelem = 1;
+ osd_req_decode_get_attr_list(ios->per_dev[0].or,
+ &cur_attr, &nelem, &iter);
+ if ((cur_attr.attr_page == attr->attr_page) &&
+ (cur_attr.attr_id == attr->attr_id)) {
+ attr->len = cur_attr.len;
+ attr->val_ptr = cur_attr.val_ptr;
+ return 0;
+ }
+ } while (iter);
+
+ return -EIO;
+}
+EXPORT_SYMBOL(extract_attr_from_ios);
+
+static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
+ struct osd_attr *attr)
+{
+ int last_comp = cur_comp + ios->layout->mirrors_p1;
+
+ for (; cur_comp < last_comp; ++cur_comp) {
+ struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
+ struct osd_request *or;
+
+ or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL);
+ if (unlikely(!or)) {
+ ORE_ERR("%s: osd_start_request failed\n", __func__);
+ return -ENOMEM;
+ }
+ per_dev->or = or;
+
+ osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
+ osd_req_add_set_attr_list(or, attr, 1);
+ }
+
+ return 0;
+}
+
+struct _trunc_info {
+ struct ore_striping_info si;
+ u64 prev_group_obj_off;
+ u64 next_group_obj_off;
+
+ unsigned first_group_dev;
+ unsigned nex_group_dev;
+};
+
+static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
+ struct _trunc_info *ti)
+{
+ unsigned stripe_unit = layout->stripe_unit;
+
+ ore_calc_stripe_info(layout, file_offset, 0, &ti->si);
+
+ ti->prev_group_obj_off = ti->si.M * stripe_unit;
+ ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
+
+ ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
+ ti->nex_group_dev = ti->first_group_dev + layout->group_width;
+}
+
+int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
+ u64 size)
+{
+ struct ore_io_state *ios;
+ struct exofs_trunc_attr {
+ struct osd_attr attr;
+ __be64 newsize;
+ } *size_attrs;
+ struct _trunc_info ti;
+ int i, ret;
+
+ ret = ore_get_io_state(layout, oc, &ios);
+ if (unlikely(ret))
+ return ret;
+
+ _calc_trunk_info(ios->layout, size, &ti);
+
+ size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
+ GFP_KERNEL);
+ if (unlikely(!size_attrs)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ios->numdevs = ios->oc->numdevs;
+
+ for (i = 0; i < ios->numdevs; ++i) {
+ struct exofs_trunc_attr *size_attr = &size_attrs[i];
+ u64 obj_size;
+
+ if (i < ti.first_group_dev)
+ obj_size = ti.prev_group_obj_off;
+ else if (i >= ti.nex_group_dev)
+ obj_size = ti.next_group_obj_off;
+ else if (i < ti.si.dev) /* dev within this group */
+ obj_size = ti.si.obj_offset +
+ ios->layout->stripe_unit - ti.si.unit_off;
+ else if (i == ti.si.dev)
+ obj_size = ti.si.obj_offset;
+ else /* i > ti.dev */
+ obj_size = ti.si.obj_offset - ti.si.unit_off;
+
+ size_attr->newsize = cpu_to_be64(obj_size);
+ size_attr->attr = g_attr_logical_length;
+ size_attr->attr.val_ptr = &size_attr->newsize;
+
+ ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
+ _LLU(oc->comps->obj.id), _LLU(obj_size), i);
+ ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
+ &size_attr->attr);
+ if (unlikely(ret))
+ goto out;
+ }
+ ret = ore_io_execute(ios);
+
+out:
+ kfree(size_attrs);
+ ore_put_io_state(ios);
+ return ret;
+}
+EXPORT_SYMBOL(ore_truncate);
+
+const struct osd_attr g_attr_logical_length = ATTR_DEF(
+ OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
+EXPORT_SYMBOL(g_attr_logical_length);
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
new file mode 100644
index 00000000000..7f20f25c232
--- /dev/null
+++ b/fs/exofs/ore_raid.c
@@ -0,0 +1,721 @@
+/*
+ * Copyright (C) 2011
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of the objects raid engine (ore).
+ *
+ * It is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "ore". If not, write to the Free Software Foundation, Inc:
+ * "Free Software Foundation <info@fsf.org>"
+ */
+
+#include <linux/gfp.h>
+#include <linux/async_tx.h>
+
+#include "ore_raid.h"
+
+#undef ORE_DBGMSG2
+#define ORE_DBGMSG2 ORE_DBGMSG
+
+static struct page *_raid_page_alloc(void)
+{
+ return alloc_page(GFP_KERNEL);
+}
+
+static void _raid_page_free(struct page *p)
+{
+ __free_page(p);
+}
+
+/* This struct is forward declare in ore_io_state, but is private to here.
+ * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit.
+ *
+ * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn.
+ * Ascending page index access is sp2d(p-minor, c-major). But storage is
+ * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor
+ * API.
+ */
+struct __stripe_pages_2d {
+ /* Cache some hot path repeated calculations */
+ unsigned parity;
+ unsigned data_devs;
+ unsigned pages_in_unit;
+
+ bool needed ;
+
+ /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */
+ struct __1_page_stripe {
+ bool alloc;
+ unsigned write_count;
+ struct async_submit_ctl submit;
+ struct dma_async_tx_descriptor *tx;
+
+ /* The size of this array is data_devs + parity */
+ struct page **pages;
+ struct page **scribble;
+ /* bool array, size of this array is data_devs */
+ char *page_is_read;
+ } _1p_stripes[];
+};
+
+/* This can get bigger then a page. So support multiple page allocations
+ * _sp2d_free should be called even if _sp2d_alloc fails (by returning
+ * none-zero).
+ */
+static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
+ unsigned parity, struct __stripe_pages_2d **psp2d)
+{
+ struct __stripe_pages_2d *sp2d;
+ unsigned data_devs = group_width - parity;
+ struct _alloc_all_bytes {
+ struct __alloc_stripe_pages_2d {
+ struct __stripe_pages_2d sp2d;
+ struct __1_page_stripe _1p_stripes[pages_in_unit];
+ } __asp2d;
+ struct __alloc_1p_arrays {
+ struct page *pages[group_width];
+ struct page *scribble[group_width];
+ char page_is_read[data_devs];
+ } __a1pa[pages_in_unit];
+ } *_aab;
+ struct __alloc_1p_arrays *__a1pa;
+ struct __alloc_1p_arrays *__a1pa_end;
+ const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]);
+ unsigned num_a1pa, alloc_size, i;
+
+ /* FIXME: check these numbers in ore_verify_layout */
+ BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE);
+ BUG_ON(sizeof__a1pa > PAGE_SIZE);
+
+ if (sizeof(*_aab) > PAGE_SIZE) {
+ num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa;
+ alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa;
+ } else {
+ num_a1pa = pages_in_unit;
+ alloc_size = sizeof(*_aab);
+ }
+
+ _aab = kzalloc(alloc_size, GFP_KERNEL);
+ if (unlikely(!_aab)) {
+ ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size);
+ return -ENOMEM;
+ }
+
+ sp2d = &_aab->__asp2d.sp2d;
+ *psp2d = sp2d; /* From here Just call _sp2d_free */
+
+ __a1pa = _aab->__a1pa;
+ __a1pa_end = __a1pa + num_a1pa;
+
+ for (i = 0; i < pages_in_unit; ++i) {
+ if (unlikely(__a1pa >= __a1pa_end)) {
+ num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa,
+ pages_in_unit - i);
+
+ __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL);
+ if (unlikely(!__a1pa)) {
+ ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
+ num_a1pa);
+ return -ENOMEM;
+ }
+ __a1pa_end = __a1pa + num_a1pa;
+ /* First *pages is marked for kfree of the buffer */
+ sp2d->_1p_stripes[i].alloc = true;
+ }
+
+ sp2d->_1p_stripes[i].pages = __a1pa->pages;
+ sp2d->_1p_stripes[i].scribble = __a1pa->scribble ;
+ sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read;
+ ++__a1pa;
+ }
+
+ sp2d->parity = parity;
+ sp2d->data_devs = data_devs;
+ sp2d->pages_in_unit = pages_in_unit;
+ return 0;
+}
+
+static void _sp2d_reset(struct __stripe_pages_2d *sp2d,
+ const struct _ore_r4w_op *r4w, void *priv)
+{
+ unsigned data_devs = sp2d->data_devs;
+ unsigned group_width = data_devs + sp2d->parity;
+ int p, c;
+
+ if (!sp2d->needed)
+ return;
+
+ for (c = data_devs - 1; c >= 0; --c)
+ for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+ if (_1ps->page_is_read[c]) {
+ struct page *page = _1ps->pages[c];
+
+ r4w->put_page(priv, page);
+ _1ps->page_is_read[c] = false;
+ }
+ }
+
+ for (p = 0; p < sp2d->pages_in_unit; p++) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+ memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages));
+ _1ps->write_count = 0;
+ _1ps->tx = NULL;
+ }
+
+ sp2d->needed = false;
+}
+
+static void _sp2d_free(struct __stripe_pages_2d *sp2d)
+{
+ unsigned i;
+
+ if (!sp2d)
+ return;
+
+ for (i = 0; i < sp2d->pages_in_unit; ++i) {
+ if (sp2d->_1p_stripes[i].alloc)
+ kfree(sp2d->_1p_stripes[i].pages);
+ }
+
+ kfree(sp2d);
+}
+
+static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d)
+{
+ unsigned p;
+
+ for (p = 0; p < sp2d->pages_in_unit; p++) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+ if (_1ps->write_count)
+ return p;
+ }
+
+ return ~0;
+}
+
+static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
+{
+ int p;
+
+ for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+ if (_1ps->write_count)
+ return p;
+ }
+
+ return ~0;
+}
+
+static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
+{
+ unsigned p;
+ unsigned tx_flags = ASYNC_TX_ACK;
+
+ if (sp2d->parity == 1)
+ tx_flags |= ASYNC_TX_XOR_ZERO_DST;
+
+ for (p = 0; p < sp2d->pages_in_unit; p++) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+ if (!_1ps->write_count)
+ continue;
+
+ init_async_submit(&_1ps->submit, tx_flags,
+ NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble);
+
+ if (sp2d->parity == 1)
+ _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs],
+ _1ps->pages, 0, sp2d->data_devs,
+ PAGE_SIZE, &_1ps->submit);
+ else /* parity == 2 */
+ _1ps->tx = async_gen_syndrome(_1ps->pages, 0,
+ sp2d->data_devs + sp2d->parity,
+ PAGE_SIZE, &_1ps->submit);
+ }
+
+ for (p = 0; p < sp2d->pages_in_unit; p++) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+ /* NOTE: We wait for HW synchronously (I don't have such HW
+ * to test with.) Is parallelism needed with today's multi
+ * cores?
+ */
+ async_tx_issue_pending(_1ps->tx);
+ }
+}
+
+void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
+ struct ore_striping_info *si, struct page *page)
+{
+ struct __1_page_stripe *_1ps;
+
+ sp2d->needed = true;
+
+ _1ps = &sp2d->_1p_stripes[si->cur_pg];
+ _1ps->pages[si->cur_comp] = page;
+ ++_1ps->write_count;
+
+ si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit;
+ /* si->cur_comp is advanced outside at main loop */
+}
+
+void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
+ bool not_last)
+{
+ struct osd_sg_entry *sge;
+
+ ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
+ "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
+ per_dev->dev, cur_len, not_last, per_dev->cur_sg,
+ _LLU(per_dev->offset), per_dev->length,
+ per_dev->last_sgs_total);
+
+ if (!per_dev->cur_sg) {
+ sge = per_dev->sglist;
+
+ /* First time we prepare two entries */
+ if (per_dev->length) {
+ ++per_dev->cur_sg;
+ sge->offset = per_dev->offset;
+ sge->len = per_dev->length;
+ } else {
+ /* Here the parity is the first unit of this object.
+ * This happens every time we reach a parity device on
+ * the same stripe as the per_dev->offset. We need to
+ * just skip this unit.
+ */
+ per_dev->offset += cur_len;
+ return;
+ }
+ } else {
+ /* finalize the last one */
+ sge = &per_dev->sglist[per_dev->cur_sg - 1];
+ sge->len = per_dev->length - per_dev->last_sgs_total;
+ }
+
+ if (not_last) {
+ /* Partly prepare the next one */
+ struct osd_sg_entry *next_sge = sge + 1;
+
+ ++per_dev->cur_sg;
+ next_sge->offset = sge->offset + sge->len + cur_len;
+ /* Save cur len so we know how mutch was added next time */
+ per_dev->last_sgs_total = per_dev->length;
+ next_sge->len = 0;
+ } else if (!sge->len) {
+ /* Optimize for when the last unit is a parity */
+ --per_dev->cur_sg;
+ }
+}
+
+static int _alloc_read_4_write(struct ore_io_state *ios)
+{
+ struct ore_layout *layout = ios->layout;
+ int ret;
+ /* We want to only read those pages not in cache so worst case
+ * is a stripe populated with every other page
+ */
+ unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2;
+
+ ret = _ore_get_io_state(layout, ios->oc,
+ layout->group_width * layout->mirrors_p1,
+ sgs_per_dev, 0, &ios->ios_read_4_write);
+ return ret;
+}
+
+/* @si contains info of the to-be-inserted page. Update of @si should be
+ * maintained by caller. Specificaly si->dev, si->obj_offset, ...
+ */
+static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si,
+ struct page *page, unsigned pg_len)
+{
+ struct request_queue *q;
+ struct ore_per_dev_state *per_dev;
+ struct ore_io_state *read_ios;
+ unsigned first_dev = si->dev - (si->dev %
+ (ios->layout->group_width * ios->layout->mirrors_p1));
+ unsigned comp = si->dev - first_dev;
+ unsigned added_len;
+
+ if (!ios->ios_read_4_write) {
+ int ret = _alloc_read_4_write(ios);
+
+ if (unlikely(ret))
+ return ret;
+ }
+
+ read_ios = ios->ios_read_4_write;
+ read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1;
+
+ per_dev = &read_ios->per_dev[comp];
+ if (!per_dev->length) {
+ per_dev->bio = bio_kmalloc(GFP_KERNEL,
+ ios->sp2d->pages_in_unit);
+ if (unlikely(!per_dev->bio)) {
+ ORE_DBGMSG("Failed to allocate BIO size=%u\n",
+ ios->sp2d->pages_in_unit);
+ return -ENOMEM;
+ }
+ per_dev->offset = si->obj_offset;
+ per_dev->dev = si->dev;
+ } else if (si->obj_offset != (per_dev->offset + per_dev->length)) {
+ u64 gap = si->obj_offset - (per_dev->offset + per_dev->length);
+
+ _ore_add_sg_seg(per_dev, gap, true);
+ }
+ q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev));
+ added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len,
+ si->obj_offset % PAGE_SIZE);
+ if (unlikely(added_len != pg_len)) {
+ ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
+ per_dev->bio->bi_vcnt);
+ return -ENOMEM;
+ }
+
+ per_dev->length += pg_len;
+ return 0;
+}
+
+/* read the beginning of an unaligned first page */
+static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page)
+{
+ struct ore_striping_info si;
+ unsigned pg_len;
+
+ ore_calc_stripe_info(ios->layout, ios->offset, 0, &si);
+
+ pg_len = si.obj_offset % PAGE_SIZE;
+ si.obj_offset -= pg_len;
+
+ ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n",
+ _LLU(si.obj_offset), pg_len, page->index, si.dev);
+
+ return _add_to_r4w(ios, &si, page, pg_len);
+}
+
+/* read the end of an incomplete last page */
+static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset)
+{
+ struct ore_striping_info si;
+ struct page *page;
+ unsigned pg_len, p, c;
+
+ ore_calc_stripe_info(ios->layout, *offset, 0, &si);
+
+ p = si.cur_pg;
+ c = si.cur_comp;
+ page = ios->sp2d->_1p_stripes[p].pages[c];
+
+ pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE);
+ *offset += pg_len;
+
+ ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n",
+ p, c, _LLU(*offset), pg_len, si.dev, si.par_dev);
+
+ BUG_ON(!page);
+
+ return _add_to_r4w(ios, &si, page, pg_len);
+}
+
+static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
+{
+ struct bio_vec *bv;
+ unsigned i, d;
+
+ /* loop on all devices all pages */
+ for (d = 0; d < ios->numdevs; d++) {
+ struct bio *bio = ios->per_dev[d].bio;
+
+ if (!bio)
+ continue;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ struct page *page = bv->bv_page;
+
+ SetPageUptodate(page);
+ if (PageError(page))
+ ClearPageError(page);
+ }
+ }
+}
+
+/* read_4_write is hacked to read the start of the first stripe and/or
+ * the end of the last stripe. If needed, with an sg-gap at each device/page.
+ * It is assumed to be called after the to_be_written pages of the first stripe
+ * are populating ios->sp2d[][]
+ *
+ * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations
+ * These pages are held at sp2d[p].pages[c] but with
+ * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are
+ * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is
+ * @uptodate=true, so we don't need to read it, only unlock, after IO.
+ *
+ * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then
+ * to-be-written count, we should consider the xor-in-place mode.
+ * need_to_read_pages_count is the actual number of pages not present in cache.
+ * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough
+ * approximation? In this mode the read pages are put in the empty places of
+ * ios->sp2d[p][*], xor is calculated the same way. These pages are
+ * allocated/freed and don't go through cache
+ */
+static int _read_4_write_first_stripe(struct ore_io_state *ios)
+{
+ struct ore_striping_info read_si;
+ struct __stripe_pages_2d *sp2d = ios->sp2d;
+ u64 offset = ios->si.first_stripe_start;
+ unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
+
+ if (offset == ios->offset) /* Go to start collect $200 */
+ goto read_last_stripe;
+
+ min_p = _sp2d_min_pg(sp2d);
+ max_p = _sp2d_max_pg(sp2d);
+
+ ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n",
+ offset, ios->offset, min_p, max_p);
+
+ for (c = 0; ; c++) {
+ ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+ read_si.obj_offset += min_p * PAGE_SIZE;
+ offset += min_p * PAGE_SIZE;
+ for (p = min_p; p <= max_p; p++) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+ struct page **pp = &_1ps->pages[c];
+ bool uptodate;
+
+ if (*pp) {
+ if (ios->offset % PAGE_SIZE)
+ /* Read the remainder of the page */
+ _add_to_r4w_first_page(ios, *pp);
+ /* to-be-written pages start here */
+ goto read_last_stripe;
+ }
+
+ *pp = ios->r4w->get_page(ios->private, offset,
+ &uptodate);
+ if (unlikely(!*pp))
+ return -ENOMEM;
+
+ if (!uptodate)
+ _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE);
+
+ /* Mark read-pages to be cache_released */
+ _1ps->page_is_read[c] = true;
+ read_si.obj_offset += PAGE_SIZE;
+ offset += PAGE_SIZE;
+ }
+ offset += (sp2d->pages_in_unit - p) * PAGE_SIZE;
+ }
+
+read_last_stripe:
+ return 0;
+}
+
+static int _read_4_write_last_stripe(struct ore_io_state *ios)
+{
+ struct ore_striping_info read_si;
+ struct __stripe_pages_2d *sp2d = ios->sp2d;
+ u64 offset;
+ u64 last_stripe_end;
+ unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
+ unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
+
+ offset = ios->offset + ios->length;
+ if (offset % PAGE_SIZE)
+ _add_to_r4w_last_page(ios, &offset);
+ /* offset will be aligned to next page */
+
+ last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe)
+ * bytes_in_stripe;
+ if (offset == last_stripe_end) /* Optimize for the aligned case */
+ goto read_it;
+
+ ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+ p = read_si.cur_pg;
+ c = read_si.cur_comp;
+
+ if (min_p == sp2d->pages_in_unit) {
+ /* Didn't do it yet */
+ min_p = _sp2d_min_pg(sp2d);
+ max_p = _sp2d_max_pg(sp2d);
+ }
+
+ ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n",
+ offset, last_stripe_end, min_p, max_p);
+
+ while (offset < last_stripe_end) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+ if ((min_p <= p) && (p <= max_p)) {
+ struct page *page;
+ bool uptodate;
+
+ BUG_ON(_1ps->pages[c]);
+ page = ios->r4w->get_page(ios->private, offset,
+ &uptodate);
+ if (unlikely(!page))
+ return -ENOMEM;
+
+ _1ps->pages[c] = page;
+ /* Mark read-pages to be cache_released */
+ _1ps->page_is_read[c] = true;
+ if (!uptodate)
+ _add_to_r4w(ios, &read_si, page, PAGE_SIZE);
+ }
+
+ offset += PAGE_SIZE;
+ if (p == (sp2d->pages_in_unit - 1)) {
+ ++c;
+ p = 0;
+ ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+ } else {
+ read_si.obj_offset += PAGE_SIZE;
+ ++p;
+ }
+ }
+
+read_it:
+ return 0;
+}
+
+static int _read_4_write_execute(struct ore_io_state *ios)
+{
+ struct ore_io_state *ios_read;
+ unsigned i;
+ int ret;
+
+ ios_read = ios->ios_read_4_write;
+ if (!ios_read)
+ return 0;
+
+ /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change
+ * to check for per_dev->bio
+ */
+ ios_read->pages = ios->pages;
+
+ /* Now read these devices */
+ for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) {
+ ret = _ore_read_mirror(ios_read, i);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ ret = ore_io_execute(ios_read); /* Synchronus execution */
+ if (unlikely(ret)) {
+ ORE_DBGMSG("!! ore_io_execute => %d\n", ret);
+ return ret;
+ }
+
+ _mark_read4write_pages_uptodate(ios_read, ret);
+ ore_put_io_state(ios_read);
+ ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */
+ return 0;
+}
+
+/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */
+int _ore_add_parity_unit(struct ore_io_state *ios,
+ struct ore_striping_info *si,
+ struct ore_per_dev_state *per_dev,
+ unsigned cur_len, bool do_xor)
+{
+ if (ios->reading) {
+ if (per_dev->cur_sg >= ios->sgs_per_dev) {
+ ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" ,
+ per_dev->cur_sg, ios->sgs_per_dev);
+ return -ENOMEM;
+ }
+ _ore_add_sg_seg(per_dev, cur_len, true);
+ } else {
+ struct __stripe_pages_2d *sp2d = ios->sp2d;
+ struct page **pages = ios->parity_pages + ios->cur_par_page;
+ unsigned num_pages;
+ unsigned array_start = 0;
+ unsigned i;
+ int ret;
+
+ si->cur_pg = _sp2d_min_pg(sp2d);
+ num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
+
+ if (!per_dev->length) {
+ per_dev->offset += si->cur_pg * PAGE_SIZE;
+ /* If first stripe, Read in all read4write pages
+ * (if needed) before we calculate the first parity.
+ */
+ if (do_xor)
+ _read_4_write_first_stripe(ios);
+ }
+ if (!cur_len && do_xor)
+ /* If last stripe r4w pages of last stripe */
+ _read_4_write_last_stripe(ios);
+ _read_4_write_execute(ios);
+
+ for (i = 0; i < num_pages; i++) {
+ pages[i] = _raid_page_alloc();
+ if (unlikely(!pages[i]))
+ return -ENOMEM;
+
+ ++(ios->cur_par_page);
+ }
+
+ BUG_ON(si->cur_comp < sp2d->data_devs);
+ BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
+
+ ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
+ per_dev, num_pages * PAGE_SIZE);
+ if (unlikely(ret))
+ return ret;
+
+ if (do_xor) {
+ _gen_xor_unit(sp2d);
+ _sp2d_reset(sp2d, ios->r4w, ios->private);
+ }
+ }
+ return 0;
+}
+
+int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
+{
+ if (ios->parity_pages) {
+ struct ore_layout *layout = ios->layout;
+ unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
+
+ if (_sp2d_alloc(pages_in_unit, layout->group_width,
+ layout->parity, &ios->sp2d)) {
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+void _ore_free_raid_stuff(struct ore_io_state *ios)
+{
+ if (ios->sp2d) { /* writing and raid */
+ unsigned i;
+
+ for (i = 0; i < ios->cur_par_page; i++) {
+ struct page *page = ios->parity_pages[i];
+
+ if (page)
+ _raid_page_free(page);
+ }
+ if (ios->extra_part_alloc)
+ kfree(ios->parity_pages);
+ /* If IO returned an error pages might need unlocking */
+ _sp2d_reset(ios->sp2d, ios->r4w, ios->private);
+ _sp2d_free(ios->sp2d);
+ } else {
+ /* Will only be set if raid reading && sglist is big */
+ if (ios->extra_part_alloc)
+ kfree(ios->per_dev[0].sglist);
+ }
+ if (ios->ios_read_4_write)
+ ore_put_io_state(ios->ios_read_4_write);
+}
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
new file mode 100644
index 00000000000..cf6375d8212
--- /dev/null
+++ b/fs/exofs/ore_raid.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) from 2011
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of the objects raid engine (ore).
+ *
+ * It is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "ore". If not, write to the Free Software Foundation, Inc:
+ * "Free Software Foundation <info@fsf.org>"
+ */
+
+#include <scsi/osd_ore.h>
+
+#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
+
+#ifdef CONFIG_EXOFS_DEBUG
+#define ORE_DBGMSG(fmt, a...) \
+ printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
+#else
+#define ORE_DBGMSG(fmt, a...) \
+ do { if (0) printk(fmt, ##a); } while (0)
+#endif
+
+/* u64 has problems with printk this will cast it to unsigned long long */
+#define _LLU(x) (unsigned long long)(x)
+
+#define ORE_DBGMSG2(M...) do {} while (0)
+/* #define ORE_DBGMSG2 ORE_DBGMSG */
+
+/* ios_raid.c stuff needed by ios.c */
+int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
+void _ore_free_raid_stuff(struct ore_io_state *ios);
+
+void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
+ bool not_last);
+int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
+ struct ore_per_dev_state *per_dev, unsigned cur_len,
+ bool do_xor);
+void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
+ struct ore_striping_info *si, struct page *page);
+static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
+ struct ore_striping_info *si, struct page *page)
+{
+ if (!sp2d) /* Inline the fast path */
+ return; /* Hay no raid stuff */
+ _ore_add_stripe_page(sp2d, si, page);
+}
+
+/* ios.c stuff needed by ios_raid.c */
+int _ore_get_io_state(struct ore_layout *layout,
+ struct ore_components *oc, unsigned numdevs,
+ unsigned sgs_per_dev, unsigned num_par_pages,
+ struct ore_io_state **pios);
+int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
+ unsigned pgbase, struct page **pages,
+ struct ore_per_dev_state *per_dev, int cur_len);
+int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp);
+int ore_io_execute(struct ore_io_state *ios);
diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h
deleted file mode 100644
index c52e9888b8a..00000000000
--- a/fs/exofs/pnfs.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify it under the
- * terms of the GNU General Public License version 2 as published by the Free
- * Software Foundation.
- *
- */
-
-/* FIXME: Remove this file once pnfs hits mainline */
-
-#ifndef __EXOFS_PNFS_H__
-#define __EXOFS_PNFS_H__
-
-#if ! defined(__PNFS_OSD_XDR_H__)
-
-enum pnfs_iomode {
- IOMODE_READ = 1,
- IOMODE_RW = 2,
- IOMODE_ANY = 3,
-};
-
-/* Layout Structure */
-enum pnfs_osd_raid_algorithm4 {
- PNFS_OSD_RAID_0 = 1,
- PNFS_OSD_RAID_4 = 2,
- PNFS_OSD_RAID_5 = 3,
- PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */
-};
-
-struct pnfs_osd_data_map {
- u32 odm_num_comps;
- u64 odm_stripe_unit;
- u32 odm_group_width;
- u32 odm_group_depth;
- u32 odm_mirror_cnt;
- u32 odm_raid_algorithm;
-};
-
-#endif /* ! defined(__PNFS_OSD_XDR_H__) */
-
-#endif /* __EXOFS_PNFS_H__ */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 8c6c4669b38..ed73ed8ebbe 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -35,11 +35,14 @@
#include <linux/parser.h>
#include <linux/vfs.h>
#include <linux/random.h>
+#include <linux/module.h>
#include <linux/exportfs.h>
#include <linux/slab.h>
#include "exofs.h"
+#define EXOFS_DBGMSG2(M...) do {} while (0)
+
/******************************************************************************
* MOUNT OPTIONS
*****************************************************************************/
@@ -48,6 +51,7 @@
* struct to hold what we get from mount options
*/
struct exofs_mountopt {
+ bool is_osdname;
const char *dev_name;
uint64_t pid;
int timeout;
@@ -56,7 +60,7 @@ struct exofs_mountopt {
/*
* exofs-specific mount-time options.
*/
-enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
+enum { Opt_name, Opt_pid, Opt_to, Opt_err };
/*
* Our mount-time options. These should ideally be 64-bit unsigned, but the
@@ -64,6 +68,7 @@ enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
* sufficient for most applications now.
*/
static match_table_t tokens = {
+ {Opt_name, "osdname=%s"},
{Opt_pid, "pid=%u"},
{Opt_to, "to=%u"},
{Opt_err, NULL}
@@ -94,6 +99,14 @@ static int parse_options(char *options, struct exofs_mountopt *opts)
token = match_token(p, tokens, args);
switch (token) {
+ case Opt_name:
+ opts->dev_name = match_strdup(&args[0]);
+ if (unlikely(!opts->dev_name)) {
+ EXOFS_ERR("Error allocating dev_name");
+ return -ENOMEM;
+ }
+ opts->is_osdname = true;
+ break;
case Opt_pid:
if (0 == match_strlcpy(str, &args[0], sizeof(str)))
return -EINVAL;
@@ -153,7 +166,6 @@ static struct inode *exofs_alloc_inode(struct super_block *sb)
static void exofs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
}
@@ -194,10 +206,152 @@ static int init_inodecache(void)
*/
static void destroy_inodecache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(exofs_inode_cachep);
}
/******************************************************************************
+ * Some osd helpers
+ *****************************************************************************/
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
+{
+ osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
+}
+
+static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
+ u64 offset, void *p, unsigned length)
+{
+ struct osd_request *or = osd_start_request(od, GFP_KERNEL);
+/* struct osd_sense_info osi = {.key = 0};*/
+ int ret;
+
+ if (unlikely(!or)) {
+ EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
+ return -ENOMEM;
+ }
+ ret = osd_req_read_kern(or, obj, offset, p, length);
+ if (unlikely(ret)) {
+ EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
+ goto out;
+ }
+
+ ret = osd_finalize_request(or, 0, cred, NULL);
+ if (unlikely(ret)) {
+ EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
+ goto out;
+ }
+
+ ret = osd_execute_request(or);
+ if (unlikely(ret))
+ EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
+ /* osd_req_decode_sense(or, ret); */
+
+out:
+ osd_end_request(or);
+ EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
+ "length=0x%llx dev=%p ret=>%d\n",
+ _LLU(obj->id), _LLU(offset), _LLU(length), od, ret);
+ return ret;
+}
+
+static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
+ EXOFS_APAGE_SB_DATA,
+ EXOFS_ATTR_SB_STATS,
+ sizeof(struct exofs_sb_stats));
+
+static int __sbi_read_stats(struct exofs_sb_info *sbi)
+{
+ struct osd_attr attrs[] = {
+ [0] = g_attr_sb_stats,
+ };
+ struct ore_io_state *ios;
+ int ret;
+
+ ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
+ if (unlikely(ret)) {
+ EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
+ return ret;
+ }
+
+ ios->in_attr = attrs;
+ ios->in_attr_len = ARRAY_SIZE(attrs);
+
+ ret = ore_read(ios);
+ if (unlikely(ret)) {
+ EXOFS_ERR("Error reading super_block stats => %d\n", ret);
+ goto out;
+ }
+
+ ret = extract_attr_from_ios(ios, &attrs[0]);
+ if (ret) {
+ EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__);
+ goto out;
+ }
+ if (attrs[0].len) {
+ struct exofs_sb_stats *ess;
+
+ if (unlikely(attrs[0].len != sizeof(*ess))) {
+ EXOFS_ERR("%s: Wrong version of exofs_sb_stats "
+ "size(%d) != expected(%zd)\n",
+ __func__, attrs[0].len, sizeof(*ess));
+ goto out;
+ }
+
+ ess = attrs[0].val_ptr;
+ sbi->s_nextid = le64_to_cpu(ess->s_nextid);
+ sbi->s_numfiles = le32_to_cpu(ess->s_numfiles);
+ }
+
+out:
+ ore_put_io_state(ios);
+ return ret;
+}
+
+static void stats_done(struct ore_io_state *ios, void *p)
+{
+ ore_put_io_state(ios);
+ /* Good thanks nothing to do anymore */
+}
+
+/* Asynchronously write the stats attribute */
+int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
+{
+ struct osd_attr attrs[] = {
+ [0] = g_attr_sb_stats,
+ };
+ struct ore_io_state *ios;
+ int ret;
+
+ ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
+ if (unlikely(ret)) {
+ EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
+ return ret;
+ }
+
+ sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid);
+ sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
+ attrs[0].val_ptr = &sbi->s_ess;
+
+
+ ios->done = stats_done;
+ ios->private = sbi;
+ ios->out_attr = attrs;
+ ios->out_attr_len = ARRAY_SIZE(attrs);
+
+ ret = ore_write(ios);
+ if (unlikely(ret)) {
+ EXOFS_ERR("%s: ore_write failed.\n", __func__);
+ ore_put_io_state(ios);
+ }
+
+ return ret;
+}
+
+/******************************************************************************
* SUPERBLOCK FUNCTIONS
*****************************************************************************/
static const struct super_operations exofs_sops;
@@ -206,60 +360,57 @@ static const struct export_operations exofs_export_ops;
/*
* Write the superblock to the OSD
*/
-int exofs_sync_fs(struct super_block *sb, int wait)
+static int exofs_sync_fs(struct super_block *sb, int wait)
{
struct exofs_sb_info *sbi;
struct exofs_fscb *fscb;
- struct exofs_io_state *ios;
+ struct ore_comp one_comp;
+ struct ore_components oc;
+ struct ore_io_state *ios;
int ret = -ENOMEM;
- lock_super(sb);
+ fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
+ if (unlikely(!fscb))
+ return -ENOMEM;
+
sbi = sb->s_fs_info;
- fscb = &sbi->s_fscb;
- ret = exofs_get_io_state(&sbi->layout, &ios);
- if (ret)
+ /* NOTE: We no longer dirty the super_block anywhere in exofs. The
+ * reason we write the fscb here on unmount is so we can stay backwards
+ * compatible with fscb->s_version == 1. (What we are not compatible
+ * with is if a new version FS crashed and then we try to mount an old
+ * version). Otherwise the exofs_fscb is read-only from mkfs time. All
+ * the writeable info is set in exofs_sbi_write_stats() above.
+ */
+
+ exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID);
+
+ ret = ore_get_io_state(&sbi->layout, &oc, &ios);
+ if (unlikely(ret))
goto out;
- /* Note: We only write the changing part of the fscb. .i.e upto the
- * the fscb->s_dev_table_oid member. There is no read-modify-write
- * here.
- */
ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
memset(fscb, 0, ios->length);
fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
- fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
+ fscb->s_numfiles = cpu_to_le64(sbi->s_numfiles);
fscb->s_magic = cpu_to_le16(sb->s_magic);
fscb->s_newfs = 0;
fscb->s_version = EXOFS_FSCB_VER;
- ios->obj.id = EXOFS_SUPER_ID;
ios->offset = 0;
ios->kern_buff = fscb;
- ios->cred = sbi->s_cred;
- ret = exofs_sbi_write(ios);
- if (unlikely(ret)) {
- EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
- goto out;
- }
- sb->s_dirt = 0;
+ ret = ore_write(ios);
+ if (unlikely(ret))
+ EXOFS_ERR("%s: ore_write failed.\n", __func__);
out:
EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
- exofs_put_io_state(ios);
- unlock_super(sb);
+ ore_put_io_state(ios);
+ kfree(fscb);
return ret;
}
-static void exofs_write_super(struct super_block *sb)
-{
- if (!(sb->s_flags & MS_RDONLY))
- exofs_sync_fs(sb, 1);
- else
- sb->s_dirt = 0;
-}
-
static void _exofs_print_device(const char *msg, const char *dev_path,
struct osd_dev *od, u64 pid)
{
@@ -269,17 +420,20 @@ static void _exofs_print_device(const char *msg, const char *dev_path,
msg, dev_path ?: "", odi->osdname, _LLU(pid));
}
-void exofs_free_sbi(struct exofs_sb_info *sbi)
+static void exofs_free_sbi(struct exofs_sb_info *sbi)
{
- while (sbi->layout.s_numdevs) {
- int i = --sbi->layout.s_numdevs;
- struct osd_dev *od = sbi->layout.s_ods[i];
+ unsigned numdevs = sbi->oc.numdevs;
+
+ while (numdevs) {
+ unsigned i = --numdevs;
+ struct osd_dev *od = ore_comp_dev(&sbi->oc, i);
if (od) {
- sbi->layout.s_ods[i] = NULL;
+ ore_comp_set_dev(&sbi->oc, i, NULL);
osduld_put_device(od);
}
}
+ kfree(sbi->oc.ods);
kfree(sbi);
}
@@ -292,22 +446,24 @@ static void exofs_put_super(struct super_block *sb)
int num_pend;
struct exofs_sb_info *sbi = sb->s_fs_info;
- if (sb->s_dirt)
- exofs_write_super(sb);
-
/* make sure there are no pending commands */
for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
num_pend = atomic_read(&sbi->s_curr_pending)) {
wait_queue_head_t wq;
+
+ printk(KERN_NOTICE "%s: !!Pending operations in flight. "
+ "This is a BUG. please report to osd-dev@open-osd.org\n",
+ __func__);
init_waitqueue_head(&wq);
wait_event_timeout(wq,
(atomic_read(&sbi->s_curr_pending) == 0),
msecs_to_jiffies(100));
}
- _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0],
- sbi->layout.s_pid);
+ _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0),
+ sbi->one_comp.obj.partition);
+ exofs_sysfs_sb_del(sbi);
bdi_destroy(&sbi->bdi);
exofs_free_sbi(sbi);
sb->s_fs_info = NULL;
@@ -316,78 +472,48 @@ static void exofs_put_super(struct super_block *sb)
static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
struct exofs_device_table *dt)
{
- u64 stripe_length;
+ int ret;
- sbi->data_map.odm_num_comps =
- le32_to_cpu(dt->dt_data_map.cb_num_comps);
- sbi->data_map.odm_stripe_unit =
+ sbi->layout.stripe_unit =
le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
- sbi->data_map.odm_group_width =
+ sbi->layout.group_width =
le32_to_cpu(dt->dt_data_map.cb_group_width);
- sbi->data_map.odm_group_depth =
+ sbi->layout.group_depth =
le32_to_cpu(dt->dt_data_map.cb_group_depth);
- sbi->data_map.odm_mirror_cnt =
- le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
- sbi->data_map.odm_raid_algorithm =
+ sbi->layout.mirrors_p1 =
+ le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1;
+ sbi->layout.raid_algorithm =
le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
-/* FIXME: Only raid0 for now. if not so, do not mount */
- if (sbi->data_map.odm_num_comps != numdevs) {
- EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
- sbi->data_map.odm_num_comps, numdevs);
- return -EINVAL;
- }
- if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
- EXOFS_ERR("Only RAID_0 for now\n");
- return -EINVAL;
- }
- if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
- EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
- numdevs, sbi->data_map.odm_mirror_cnt);
- return -EINVAL;
- }
-
- if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
- EXOFS_ERR("Stripe Unit(0x%llx)"
- " must be Multples of PAGE_SIZE(0x%lx)\n",
- _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
- return -EINVAL;
- }
+ ret = ore_verify_layout(numdevs, &sbi->layout);
+
+ EXOFS_DBGMSG("exofs: layout: "
+ "num_comps=%u stripe_unit=0x%x group_width=%u "
+ "group_depth=0x%llx mirrors_p1=%u raid_algorithm=%u\n",
+ numdevs,
+ sbi->layout.stripe_unit,
+ sbi->layout.group_width,
+ _LLU(sbi->layout.group_depth),
+ sbi->layout.mirrors_p1,
+ sbi->layout.raid_algorithm);
+ return ret;
+}
- sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
- sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
+static unsigned __ra_pages(struct ore_layout *layout)
+{
+ const unsigned _MIN_RA = 32; /* min 128K read-ahead */
+ unsigned ra_pages = layout->group_width * layout->stripe_unit /
+ PAGE_SIZE;
+ unsigned max_io_pages = exofs_max_io_pages(layout, ~0);
- if (sbi->data_map.odm_group_width) {
- sbi->layout.group_width = sbi->data_map.odm_group_width;
- sbi->layout.group_depth = sbi->data_map.odm_group_depth;
- if (!sbi->layout.group_depth) {
- EXOFS_ERR("group_depth == 0 && group_width != 0\n");
- return -EINVAL;
- }
- sbi->layout.group_count = sbi->data_map.odm_num_comps /
- sbi->layout.mirrors_p1 /
- sbi->data_map.odm_group_width;
- } else {
- if (sbi->data_map.odm_group_depth) {
- printk(KERN_NOTICE "Warning: group_depth ignored "
- "group_width == 0 && group_depth == %d\n",
- sbi->data_map.odm_group_depth);
- sbi->data_map.odm_group_depth = 0;
- }
- sbi->layout.group_width = sbi->data_map.odm_num_comps /
- sbi->layout.mirrors_p1;
- sbi->layout.group_depth = -1;
- sbi->layout.group_count = 1;
- }
+ ra_pages *= 2; /* two stripes */
+ if (ra_pages < _MIN_RA)
+ ra_pages = roundup(_MIN_RA, ra_pages / 2);
- stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit;
- if (stripe_length >= (1ULL << 32)) {
- EXOFS_ERR("Total Stripe length(0x%llx)"
- " >= 32bit is not supported\n", _LLU(stripe_length));
- return -EINVAL;
- }
+ if (ra_pages > max_io_pages)
+ ra_pages = max_io_pages;
- return 0;
+ return ra_pages;
}
/* @odi is valid only as long as @fscb_dev is valid */
@@ -395,7 +521,8 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
struct osd_dev_info *odi)
{
odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
- memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len);
+ if (likely(odi->systemid_len))
+ memcpy(odi->systemid, dt_dev->systemid, OSD_SYSTEMID_LEN);
odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
odi->osdname = dt_dev->osdname;
@@ -416,14 +543,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
return !(odi->systemid_len || odi->osdname_len);
}
-static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
+static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
+ struct exofs_dev **peds)
+{
+ struct __alloc_ore_devs_and_exofs_devs {
+ /* Twice bigger table: See exofs_init_comps() and comment at
+ * exofs_read_lookup_dev_table()
+ */
+ struct ore_dev *oreds[numdevs * 2 - 1];
+ struct exofs_dev eds[numdevs];
+ } *aoded;
+ struct exofs_dev *eds;
+ unsigned i;
+
+ aoded = kzalloc(sizeof(*aoded), GFP_KERNEL);
+ if (unlikely(!aoded)) {
+ EXOFS_ERR("ERROR: failed allocating Device array[%d]\n",
+ numdevs);
+ return -ENOMEM;
+ }
+
+ sbi->oc.ods = aoded->oreds;
+ *peds = eds = aoded->eds;
+ for (i = 0; i < numdevs; ++i)
+ aoded->oreds[i] = &eds[i].ored;
+ return 0;
+}
+
+static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
+ struct osd_dev *fscb_od,
unsigned table_count)
{
- struct exofs_sb_info *sbi = *psbi;
- struct osd_dev *fscb_od;
- struct osd_obj_id obj = {.partition = sbi->layout.s_pid,
- .id = EXOFS_DEVTABLE_ID};
+ struct ore_comp comp;
struct exofs_device_table *dt;
+ struct exofs_dev *eds;
unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
sizeof(*dt);
unsigned numdevs, i;
@@ -436,10 +589,14 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
return -ENOMEM;
}
- fscb_od = sbi->layout.s_ods[0];
- sbi->layout.s_ods[0] = NULL;
- sbi->layout.s_numdevs = 0;
- ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
+ sbi->oc.numdevs = 0;
+
+ comp.obj.partition = sbi->one_comp.obj.partition;
+ comp.obj.id = EXOFS_DEVTABLE_ID;
+ exofs_make_credential(comp.cred, &comp.obj);
+
+ ret = exofs_read_kern(fscb_od, comp.cred, &comp.obj, 0, dt,
+ table_bytes);
if (unlikely(ret)) {
EXOFS_ERR("ERROR: reading device table\n");
goto out;
@@ -456,18 +613,22 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
if (unlikely(ret))
goto out;
- if (likely(numdevs > 1)) {
- unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]);
+ ret = __alloc_dev_table(sbi, numdevs, &eds);
+ if (unlikely(ret))
+ goto out;
+ /* exofs round-robins the device table view according to inode
+ * number. We hold a: twice bigger table hence inodes can point
+ * to any device and have a sequential view of the table
+ * starting at this device. See exofs_init_comps()
+ */
+ memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0],
+ (numdevs - 1) * sizeof(sbi->oc.ods[0]));
- sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
- if (unlikely(!sbi)) {
- ret = -ENOMEM;
- goto out;
- }
- memset(&sbi->layout.s_ods[1], 0,
- size - sizeof(sbi->layout.s_ods[0]));
- *psbi = sbi;
- }
+ /* create sysfs subdir under which we put the device table
+ * And cluster layout. A Superblock is identified by the string:
+ * "dev[0].osdname"_"pid"
+ */
+ exofs_sysfs_sb_add(sbi, &dt->dt_dev_table[0]);
for (i = 0; i < numdevs; i++) {
struct exofs_fscb fscb;
@@ -483,32 +644,36 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
i, odi.osdname);
+ /* the exofs id is currently the table index */
+ eds[i].did = i;
+
/* On all devices the device table is identical. The user can
* specify any one of the participating devices on the command
* line. We always keep them in device-table order.
*/
if (fscb_od && osduld_device_same(fscb_od, &odi)) {
- sbi->layout.s_ods[i] = fscb_od;
- ++sbi->layout.s_numdevs;
+ eds[i].ored.od = fscb_od;
+ ++sbi->oc.numdevs;
fscb_od = NULL;
+ exofs_sysfs_odev_add(&eds[i], sbi);
continue;
}
od = osduld_info_lookup(&odi);
- if (unlikely(IS_ERR(od))) {
+ if (IS_ERR(od)) {
ret = PTR_ERR(od);
EXOFS_ERR("ERROR: device requested is not found "
"osd_name-%s =>%d\n", odi.osdname, ret);
goto out;
}
- sbi->layout.s_ods[i] = od;
- ++sbi->layout.s_numdevs;
+ eds[i].ored.od = od;
+ ++sbi->oc.numdevs;
/* Read the fscb of the other devices to make sure the FS
* partition is there.
*/
- ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb,
+ ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb,
sizeof(fscb));
if (unlikely(ret)) {
EXOFS_ERR("ERROR: Malformed participating device "
@@ -516,6 +681,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
odi.osdname);
goto out;
}
+ exofs_sysfs_odev_add(&eds[i], sbi);
/* TODO: verify other information is correct and FS-uuid
* matches. Benny what did you say about device table
@@ -525,13 +691,11 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
out:
kfree(dt);
- if (unlikely(!ret && fscb_od)) {
- EXOFS_ERR(
- "ERROR: Bad device-table container device not present\n");
- osduld_put_device(fscb_od);
- ret = -EINVAL;
+ if (unlikely(fscb_od && !ret)) {
+ EXOFS_ERR("ERROR: Bad device-table container device not present\n");
+ osduld_put_device(fscb_od);
+ return -EINVAL;
}
-
return ret;
}
@@ -545,7 +709,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
struct exofs_sb_info *sbi; /*extended info */
struct osd_dev *od; /* Master device */
struct exofs_fscb fscb; /*on-disk superblock info */
- struct osd_obj_id obj;
+ struct ore_comp comp;
unsigned table_count;
int ret;
@@ -553,14 +717,20 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
if (!sbi)
return -ENOMEM;
- ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
- if (ret)
- goto free_bdi;
-
/* use mount options to fill superblock */
- od = osduld_path_lookup(opts->dev_name);
+ if (opts->is_osdname) {
+ struct osd_dev_info odi = {.systemid_len = 0};
+
+ odi.osdname_len = strlen(opts->dev_name);
+ odi.osdname = (u8 *)opts->dev_name;
+ od = osduld_info_lookup(&odi);
+ kfree(opts->dev_name);
+ opts->dev_name = NULL;
+ } else {
+ od = osduld_path_lookup(opts->dev_name);
+ }
if (IS_ERR(od)) {
- ret = PTR_ERR(od);
+ ret = -EINVAL;
goto free_sbi;
}
@@ -570,30 +740,35 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
sbi->layout.group_width = 1;
sbi->layout.group_depth = -1;
sbi->layout.group_count = 1;
- sbi->layout.s_ods[0] = od;
- sbi->layout.s_numdevs = 1;
- sbi->layout.s_pid = opts->pid;
sbi->s_timeout = opts->timeout;
+ sbi->one_comp.obj.partition = opts->pid;
+ sbi->one_comp.obj.id = 0;
+ exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
+ sbi->oc.single_comp = EC_SINGLE_COMP;
+ sbi->oc.comps = &sbi->one_comp;
+
/* fill in some other data by hand */
memset(sb->s_id, 0, sizeof(sb->s_id));
strcpy(sb->s_id, "exofs");
sb->s_blocksize = EXOFS_BLKSIZE;
sb->s_blocksize_bits = EXOFS_BLKSHIFT;
sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_max_links = EXOFS_LINK_MAX;
atomic_set(&sbi->s_curr_pending, 0);
sb->s_bdev = NULL;
sb->s_dev = 0;
- obj.partition = sbi->layout.s_pid;
- obj.id = EXOFS_SUPER_ID;
- exofs_make_credential(sbi->s_cred, &obj);
+ comp.obj.partition = sbi->one_comp.obj.partition;
+ comp.obj.id = EXOFS_SUPER_ID;
+ exofs_make_credential(comp.cred, &comp.obj);
- ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb));
+ ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb));
if (unlikely(ret))
goto free_sbi;
sb->s_magic = le16_to_cpu(fscb.s_magic);
+ /* NOTE: we read below to be backward compatible with old versions */
sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
@@ -604,7 +779,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
ret = -EINVAL;
goto free_sbi;
}
- if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) {
+ if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) {
EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
ret = -EINVAL;
@@ -617,12 +792,24 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
table_count = le64_to_cpu(fscb.s_dev_table_count);
if (table_count) {
- ret = exofs_read_lookup_dev_table(&sbi, table_count);
+ ret = exofs_read_lookup_dev_table(sbi, od, table_count);
+ if (unlikely(ret))
+ goto free_sbi;
+ } else {
+ struct exofs_dev *eds;
+
+ ret = __alloc_dev_table(sbi, 1, &eds);
if (unlikely(ret))
goto free_sbi;
+
+ ore_comp_set_dev(&sbi->oc, 0, od);
+ sbi->oc.numdevs = 1;
}
+ __sbi_read_stats(sbi);
+
/* set up operation vectors */
+ sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
sb->s_bdi = &sbi->bdi;
sb->s_fs_info = sbi;
sb->s_op = &exofs_sops;
@@ -633,9 +820,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
ret = PTR_ERR(root);
goto free_sbi;
}
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root) {
- iput(root);
EXOFS_ERR("ERROR: get root inode failed\n");
ret = -ENOMEM;
goto free_sbi;
@@ -650,15 +836,23 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
goto free_sbi;
}
- _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0],
- sbi->layout.s_pid);
+ ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
+ if (ret) {
+ EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
+ dput(sb->s_root);
+ sb->s_root = NULL;
+ goto free_sbi;
+ }
+
+ exofs_sysfs_dbg_print();
+ _exofs_print_device("Mounting", opts->dev_name,
+ ore_comp_dev(&sbi->oc, 0),
+ sbi->one_comp.obj.partition);
return 0;
free_sbi:
- bdi_destroy(&sbi->bdi);
-free_bdi:
EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
- opts->dev_name, sbi->layout.s_pid, ret);
+ opts->dev_name, sbi->one_comp.obj.partition, ret);
exofs_free_sbi(sbi);
return ret;
}
@@ -677,7 +871,8 @@ static struct dentry *exofs_mount(struct file_system_type *type,
if (ret)
return ERR_PTR(ret);
- opts.dev_name = dev_name;
+ if (!opts.dev_name)
+ opts.dev_name = dev_name;
return mount_nodev(type, flags, &opts, exofs_fill_super);
}
@@ -689,7 +884,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct exofs_sb_info *sbi = sb->s_fs_info;
- struct exofs_io_state *ios;
+ struct ore_io_state *ios;
struct osd_attr attrs[] = {
ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
@@ -698,21 +893,18 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
};
uint64_t capacity = ULLONG_MAX;
uint64_t used = ULLONG_MAX;
- uint8_t cred_a[OSD_CAP_LEN];
int ret;
- ret = exofs_get_io_state(&sbi->layout, &ios);
+ ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
if (ret) {
- EXOFS_DBGMSG("exofs_get_io_state failed.\n");
+ EXOFS_DBGMSG("ore_get_io_state failed.\n");
return ret;
}
- exofs_make_credential(cred_a, &ios->obj);
- ios->cred = sbi->s_cred;
ios->in_attr = attrs;
ios->in_attr_len = ARRAY_SIZE(attrs);
- ret = exofs_sbi_read(ios);
+ ret = ore_read(ios);
if (unlikely(ret))
goto out;
@@ -741,7 +933,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_namelen = EXOFS_NAME_LEN;
out:
- exofs_put_io_state(ios);
+ ore_put_io_state(ios);
return ret;
}
@@ -751,7 +943,6 @@ static const struct super_operations exofs_sops = {
.write_inode = exofs_write_inode,
.evict_inode = exofs_evict_inode,
.put_super = exofs_put_super,
- .write_super = exofs_write_super,
.sync_fs = exofs_sync_fs,
.statfs = exofs_statfs,
};
@@ -760,12 +951,12 @@ static const struct super_operations exofs_sops = {
* EXPORT OPERATIONS
*****************************************************************************/
-struct dentry *exofs_get_parent(struct dentry *child)
+static struct dentry *exofs_get_parent(struct dentry *child)
{
unsigned long ino = exofs_parent_ino(child);
if (!ino)
- return NULL;
+ return ERR_PTR(-ESTALE);
return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino));
}
@@ -819,6 +1010,7 @@ static struct file_system_type exofs_type = {
.mount = exofs_mount,
.kill_sb = generic_shutdown_super,
};
+MODULE_ALIAS_FS("exofs");
static int __init init_exofs(void)
{
@@ -832,6 +1024,9 @@ static int __init init_exofs(void)
if (err)
goto out_d;
+ /* We don't fail if sysfs creation failed */
+ exofs_sysfs_init();
+
return 0;
out_d:
destroy_inodecache();
@@ -841,6 +1036,7 @@ out:
static void __exit exit_exofs(void)
{
+ exofs_sysfs_uninit();
unregister_filesystem(&exofs_type);
destroy_inodecache();
}
diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c
new file mode 100644
index 00000000000..1b4f2f95fc3
--- /dev/null
+++ b/fs/exofs/sys.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (C) 2012
+ * Sachin Bhamare <sbhamare@panasas.com>
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License 2 as published by
+ * the Free Software Foundation.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the:
+ * Free Software Foundation <licensing@fsf.org>
+ */
+
+#include <linux/kobject.h>
+#include <linux/device.h>
+
+#include "exofs.h"
+
+struct odev_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct exofs_dev *, char *);
+ ssize_t (*store)(struct exofs_dev *, const char *, size_t);
+};
+
+static ssize_t odev_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj);
+ struct odev_attr *a = container_of(attr, struct odev_attr, attr);
+
+ return a->show ? a->show(edp, buf) : 0;
+}
+
+static ssize_t odev_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj);
+ struct odev_attr *a = container_of(attr, struct odev_attr, attr);
+
+ return a->store ? a->store(edp, buf, len) : len;
+}
+
+static const struct sysfs_ops odev_attr_ops = {
+ .show = odev_attr_show,
+ .store = odev_attr_store,
+};
+
+
+static struct kset *exofs_kset;
+
+static ssize_t osdname_show(struct exofs_dev *edp, char *buf)
+{
+ struct osd_dev *odev = edp->ored.od;
+ const struct osd_dev_info *odi = osduld_device_info(odev);
+
+ return snprintf(buf, odi->osdname_len + 1, "%s", odi->osdname);
+}
+
+static ssize_t systemid_show(struct exofs_dev *edp, char *buf)
+{
+ struct osd_dev *odev = edp->ored.od;
+ const struct osd_dev_info *odi = osduld_device_info(odev);
+
+ memcpy(buf, odi->systemid, odi->systemid_len);
+ return odi->systemid_len;
+}
+
+static ssize_t uri_show(struct exofs_dev *edp, char *buf)
+{
+ return snprintf(buf, edp->urilen, "%s", edp->uri);
+}
+
+static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len)
+{
+ uint8_t *new_uri;
+
+ edp->urilen = strlen(buf) + 1;
+ new_uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL);
+ if (new_uri == NULL)
+ return -ENOMEM;
+ edp->uri = new_uri;
+ strncpy(edp->uri, buf, edp->urilen);
+ return edp->urilen;
+}
+
+#define OSD_ATTR(name, mode, show, store) \
+ static struct odev_attr odev_attr_##name = \
+ __ATTR(name, mode, show, store)
+
+OSD_ATTR(osdname, S_IRUGO, osdname_show, NULL);
+OSD_ATTR(systemid, S_IRUGO, systemid_show, NULL);
+OSD_ATTR(uri, S_IRWXU, uri_show, uri_store);
+
+static struct attribute *odev_attrs[] = {
+ &odev_attr_osdname.attr,
+ &odev_attr_systemid.attr,
+ &odev_attr_uri.attr,
+ NULL,
+};
+
+static struct kobj_type odev_ktype = {
+ .default_attrs = odev_attrs,
+ .sysfs_ops = &odev_attr_ops,
+};
+
+static struct kobj_type uuid_ktype = {
+};
+
+void exofs_sysfs_dbg_print(void)
+{
+#ifdef CONFIG_EXOFS_DEBUG
+ struct kobject *k_name, *k_tmp;
+
+ list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
+ printk(KERN_INFO "%s: name %s ref %d\n",
+ __func__, kobject_name(k_name),
+ (int)atomic_read(&k_name->kref.refcount));
+ }
+#endif
+}
+/*
+ * This function removes all kobjects under exofs_kset
+ * At the end of it, exofs_kset kobject will have a refcount
+ * of 1 which gets decremented only on exofs module unload
+ */
+void exofs_sysfs_sb_del(struct exofs_sb_info *sbi)
+{
+ struct kobject *k_name, *k_tmp;
+ struct kobject *s_kobj = &sbi->s_kobj;
+
+ list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
+ /* Remove all that are children of this SBI */
+ if (k_name->parent == s_kobj)
+ kobject_put(k_name);
+ }
+ kobject_put(s_kobj);
+}
+
+/*
+ * This function creates sysfs entries to hold the current exofs cluster
+ * instance (uniquely identified by osdname,pid tuple).
+ * This function gets called once per exofs mount instance.
+ */
+int exofs_sysfs_sb_add(struct exofs_sb_info *sbi,
+ struct exofs_dt_device_info *dt_dev)
+{
+ struct kobject *s_kobj;
+ int retval = 0;
+ uint64_t pid = sbi->one_comp.obj.partition;
+
+ /* allocate new uuid dirent */
+ s_kobj = &sbi->s_kobj;
+ s_kobj->kset = exofs_kset;
+ retval = kobject_init_and_add(s_kobj, &uuid_ktype,
+ &exofs_kset->kobj, "%s_%llx", dt_dev->osdname, pid);
+ if (retval) {
+ EXOFS_ERR("ERROR: Failed to create sysfs entry for "
+ "uuid-%s_%llx => %d\n", dt_dev->osdname, pid, retval);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+int exofs_sysfs_odev_add(struct exofs_dev *edev, struct exofs_sb_info *sbi)
+{
+ struct kobject *d_kobj;
+ int retval = 0;
+
+ /* create osd device group which contains following attributes
+ * osdname, systemid & uri
+ */
+ d_kobj = &edev->ed_kobj;
+ d_kobj->kset = exofs_kset;
+ retval = kobject_init_and_add(d_kobj, &odev_ktype,
+ &sbi->s_kobj, "dev%u", edev->did);
+ if (retval) {
+ EXOFS_ERR("ERROR: Failed to create sysfs entry for "
+ "device dev%u\n", edev->did);
+ return retval;
+ }
+ return 0;
+}
+
+int exofs_sysfs_init(void)
+{
+ exofs_kset = kset_create_and_add("exofs", NULL, fs_kobj);
+ if (!exofs_kset) {
+ EXOFS_ERR("ERROR: kset_create_and_add exofs failed\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void exofs_sysfs_uninit(void)
+{
+ kset_unregister(exofs_kset);
+}