aboutsummaryrefslogtreecommitdiff
path: root/fs/exofs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/exofs')
-rw-r--r--fs/exofs/Kbuild2
-rw-r--r--fs/exofs/Kconfig.ore2
-rw-r--r--fs/exofs/dir.c42
-rw-r--r--fs/exofs/exofs.h14
-rw-r--r--fs/exofs/file.c10
-rw-r--r--fs/exofs/inode.c92
-rw-r--r--fs/exofs/namei.c17
-rw-r--r--fs/exofs/ore.c174
-rw-r--r--fs/exofs/ore_raid.c151
-rw-r--r--fs/exofs/ore_raid.h21
-rw-r--r--fs/exofs/super.c50
-rw-r--r--fs/exofs/sys.c205
12 files changed, 520 insertions, 260 deletions
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index 352ba149d23..389ba8312d5 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -16,5 +16,5 @@
libore-y := ore.o ore_raid.o
obj-$(CONFIG_ORE) += libore.o
-exofs-y := inode.o file.o symlink.o namei.o dir.o super.o
+exofs-y := inode.o file.o symlink.o namei.o dir.o super.o sys.o
obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore
index 1ca7fb7b6ba..2daf2329c28 100644
--- a/fs/exofs/Kconfig.ore
+++ b/fs/exofs/Kconfig.ore
@@ -9,4 +9,6 @@ config ORE
tristate
depends on EXOFS_FS || PNFS_OBJLAYOUT
select ASYNC_XOR
+ select RAID6_PQ
+ select ASYNC_PQ
default SCSI_OSD_ULD
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 80405836ba6..49f51ab4caa 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -239,22 +239,19 @@ void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
}
static int
-exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+exofs_readdir(struct file *file, struct dir_context *ctx)
{
- loff_t pos = filp->f_pos;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ loff_t pos = ctx->pos;
+ struct inode *inode = file_inode(file);
unsigned int offset = pos & ~PAGE_CACHE_MASK;
unsigned long n = pos >> PAGE_CACHE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
- unsigned char *types = NULL;
- int need_revalidate = (filp->f_version != inode->i_version);
+ int need_revalidate = (file->f_version != inode->i_version);
if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
return 0;
- types = exofs_filetype_table;
-
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
struct exofs_dir_entry *de;
@@ -263,7 +260,7 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (IS_ERR(page)) {
EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
inode->i_ino);
- filp->f_pos += PAGE_CACHE_SIZE - offset;
+ ctx->pos += PAGE_CACHE_SIZE - offset;
return PTR_ERR(page);
}
kaddr = page_address(page);
@@ -271,9 +268,9 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (offset) {
offset = exofs_validate_entry(kaddr, offset,
chunk_mask);
- filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+ ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
}
- filp->f_version = inode->i_version;
+ file->f_version = inode->i_version;
need_revalidate = 0;
}
de = (struct exofs_dir_entry *)(kaddr + offset);
@@ -288,27 +285,24 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
return -EIO;
}
if (de->inode_no) {
- int over;
- unsigned char d_type = DT_UNKNOWN;
+ unsigned char t;
- if (types && de->file_type < EXOFS_FT_MAX)
- d_type = types[de->file_type];
+ if (de->file_type < EXOFS_FT_MAX)
+ t = exofs_filetype_table[de->file_type];
+ else
+ t = DT_UNKNOWN;
- offset = (char *)de - kaddr;
- over = filldir(dirent, de->name, de->name_len,
- (n<<PAGE_CACHE_SHIFT) | offset,
+ if (!dir_emit(ctx, de->name, de->name_len,
le64_to_cpu(de->inode_no),
- d_type);
- if (over) {
+ t)) {
exofs_put_page(page);
return 0;
}
}
- filp->f_pos += le16_to_cpu(de->rec_len);
+ ctx->pos += le16_to_cpu(de->rec_len);
}
exofs_put_page(page);
}
-
return 0;
}
@@ -597,7 +591,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
goto fail;
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
de = (struct exofs_dir_entry *)kaddr;
de->name_len = 1;
de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
@@ -611,7 +605,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
de->inode_no = cpu_to_le64(parent->i_ino);
memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
exofs_set_de_type(de, inode);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
err = exofs_commit_chunk(page, 0, chunk_size);
fail:
page_cache_release(page);
@@ -669,5 +663,5 @@ not_empty:
const struct file_operations exofs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = exofs_readdir,
+ .iterate = exofs_readdir,
};
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index ca9d49665ef..fffe86fd7a4 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -56,6 +56,9 @@
struct exofs_dev {
struct ore_dev ored;
unsigned did;
+ unsigned urilen;
+ uint8_t *uri;
+ struct kobject ed_kobj;
};
/*
* our extension to the in-memory superblock
@@ -73,6 +76,7 @@ struct exofs_sb_info {
struct ore_layout layout; /* Default files layout */
struct ore_comp one_comp; /* id & cred of partition id=0*/
struct ore_components oc; /* comps for the partition */
+ struct kobject s_kobj; /* holds per-sbi kobject */
};
/*
@@ -176,6 +180,16 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
const struct osd_obj_id *obj);
int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
+/* sys.c */
+int exofs_sysfs_init(void);
+void exofs_sysfs_uninit(void);
+int exofs_sysfs_sb_add(struct exofs_sb_info *sbi,
+ struct exofs_dt_device_info *dt_dev);
+void exofs_sysfs_sb_del(struct exofs_sb_info *sbi);
+int exofs_sysfs_odev_add(struct exofs_dev *edev,
+ struct exofs_sb_info *sbi);
+void exofs_sysfs_dbg_print(void);
+
/*********************
* operation vectors *
*********************/
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 491c6c078e7..71bf8e4fb5d 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -67,17 +67,17 @@ static int exofs_flush(struct file *file, fl_owner_t id)
const struct file_operations exofs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.open = generic_file_open,
.release = exofs_release_file,
.fsync = exofs_file_fsync,
.flush = exofs_flush,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
};
const struct inode_operations exofs_file_inode_operations = {
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index ea5e1f97806..3f9cafd7393 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,15 +37,12 @@
#define EXOFS_DBGMSG2(M...) do {} while (0)
-enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), };
-
unsigned exofs_max_io_pages(struct ore_layout *layout,
unsigned expected_pages)
{
- unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
+ unsigned pages = min_t(unsigned, expected_pages,
+ layout->max_io_length / PAGE_SIZE);
- /* TODO: easily support bio chaining */
- pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE);
return pages;
}
@@ -101,7 +98,8 @@ static void _pcol_reset(struct page_collect *pcol)
* it might not end here. don't be left with nothing
*/
if (!pcol->expected_pages)
- pcol->expected_pages = MAX_PAGES_KMALLOC;
+ pcol->expected_pages =
+ exofs_max_io_pages(&pcol->sbi->layout, ~0);
}
static int pcol_try_alloc(struct page_collect *pcol)
@@ -363,12 +361,12 @@ static int read_exec(struct page_collect *pcol)
return 0;
err:
- if (!pcol->read_4_write)
- _unlock_pcol_pages(pcol, ret, READ);
-
- pcol_free(pcol);
-
+ if (!pcol_copy) /* Failed before ownership transfer */
+ pcol_copy = pcol;
+ _unlock_pcol_pages(pcol_copy, ret, READ);
+ pcol_free(pcol_copy);
kfree(pcol_copy);
+
return ret;
}
@@ -389,6 +387,8 @@ static int readpage_strip(void *data, struct page *page)
size_t len;
int ret;
+ BUG_ON(!PageLocked(page));
+
/* FIXME: Just for debugging, will be removed */
if (PageUptodate(page))
EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
@@ -572,8 +572,16 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
if (!pcol->that_locked_page ||
(pcol->that_locked_page->index != index)) {
- struct page *page = find_get_page(pcol->inode->i_mapping, index);
+ struct page *page;
+ loff_t i_size = i_size_read(pcol->inode);
+ if (offset >= i_size) {
+ *uptodate = true;
+ EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index);
+ return ZERO_PAGE(0);
+ }
+
+ page = find_get_page(pcol->inode->i_mapping, index);
if (!page) {
page = find_or_create_page(pcol->inode->i_mapping,
index, GFP_NOFS);
@@ -588,10 +596,10 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
*uptodate = true;
else
*uptodate = PageUptodate(page);
- EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate);
+ EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate);
return page;
} else {
- EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n",
+ EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n",
pcol->that_locked_page->index);
*uptodate = true;
return pcol->that_locked_page;
@@ -602,12 +610,13 @@ static void __r4w_put_page(void *priv, struct page *page)
{
struct page_collect *pcol = priv;
- if (pcol->that_locked_page != page) {
- EXOFS_DBGMSG("index=0x%lx\n", page->index);
+ if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
+ EXOFS_DBGMSG2("index=0x%lx\n", page->index);
page_cache_release(page);
return;
}
- EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index);
+ EXOFS_DBGMSG2("that_locked_page index=0x%lx\n",
+ ZERO_PAGE(0) == page ? -1 : page->index);
}
static const struct _ore_r4w_op _r4w_op = {
@@ -667,8 +676,10 @@ static int write_exec(struct page_collect *pcol)
return 0;
err:
- _unlock_pcol_pages(pcol, ret, WRITE);
- pcol_free(pcol);
+ if (!pcol_copy) /* Failed before ownership transfer */
+ pcol_copy = pcol;
+ _unlock_pcol_pages(pcol_copy, ret, WRITE);
+ pcol_free(pcol_copy);
kfree(pcol_copy);
return ret;
@@ -850,7 +861,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
static void _write_failed(struct inode *inode, loff_t to)
{
if (to > inode->i_size)
- truncate_pagecache(inode, to, inode->i_size);
+ truncate_pagecache(inode, inode->i_size);
}
int exofs_write_begin(struct file *file, struct address_space *mapping,
@@ -942,12 +953,22 @@ static int exofs_releasepage(struct page *page, gfp_t gfp)
return 0;
}
-static void exofs_invalidatepage(struct page *page, unsigned long offset)
+static void exofs_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
{
- EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
+ EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
+ page->index, offset, length);
WARN_ON(1);
}
+
+ /* TODO: Should be easy enough to do proprly */
+static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb,
+ struct iov_iter *iter, loff_t offset)
+{
+ return 0;
+}
+
const struct address_space_operations exofs_aops = {
.readpage = exofs_readpage,
.readpages = exofs_readpages,
@@ -961,7 +982,7 @@ const struct address_space_operations exofs_aops = {
/* Not implemented Yet */
.bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
- .direct_IO = NULL, /* TODO: Should be trivial to do */
+ .direct_IO = exofs_direct_IO,
/* With these NULL has special meaning or default is not exported */
.get_xip_mem = NULL,
@@ -997,7 +1018,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize)
if (likely(!ret))
truncate_setsize(inode, newsize);
- EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n",
+ EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n",
inode->i_ino, newsize, ret);
return ret;
}
@@ -1081,14 +1102,13 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
/* If object is lost on target we might as well enable it's
* delete.
*/
- if ((ret == -ENOENT) || (ret == -EINVAL))
- ret = 0;
+ ret = 0;
goto out;
}
ret = extract_attr_from_ios(ios, &attrs[0]);
if (ret) {
- EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+ EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__);
goto out;
}
WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
@@ -1096,7 +1116,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
ret = extract_attr_from_ios(ios, &attrs[1]);
if (ret) {
- EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+ EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__);
goto out;
}
if (attrs[1].len) {
@@ -1111,7 +1131,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
ret = extract_attr_from_ios(ios, &attrs[2]);
if (ret) {
- EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+ EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__);
goto out;
}
if (attrs[2].len) {
@@ -1163,8 +1183,8 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
/* copy stuff from on-disk struct to in-memory struct */
inode->i_mode = le16_to_cpu(fcb.i_mode);
- inode->i_uid = le32_to_cpu(fcb.i_uid);
- inode->i_gid = le32_to_cpu(fcb.i_gid);
+ i_uid_write(inode, le32_to_cpu(fcb.i_uid));
+ i_gid_write(inode, le32_to_cpu(fcb.i_gid));
set_nlink(inode, le16_to_cpu(fcb.i_links_count));
inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
@@ -1376,8 +1396,8 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
fcb = &args->fcb;
fcb->i_mode = cpu_to_le16(inode->i_mode);
- fcb->i_uid = cpu_to_le32(inode->i_uid);
- fcb->i_gid = cpu_to_le32(inode->i_gid);
+ fcb->i_uid = cpu_to_le32(i_uid_read(inode));
+ fcb->i_gid = cpu_to_le32(i_gid_read(inode));
fcb->i_links_count = cpu_to_le16(inode->i_nlink);
fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
@@ -1466,14 +1486,14 @@ void exofs_evict_inode(struct inode *inode)
struct ore_io_state *ios;
int ret;
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
/* TODO: should do better here */
if (inode->i_nlink || is_bad_inode(inode))
goto no_delete;
inode->i_size = 0;
- end_writeback(inode);
+ clear_inode(inode);
/* if we are deleting an obj that hasn't been created yet, wait.
* This also makes sure that create_done cannot be called with an
@@ -1503,5 +1523,5 @@ void exofs_evict_inode(struct inode *inode)
return;
no_delete:
- end_writeback(inode);
+ clear_inode(inode);
}
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 9dbf0c30103..4731fd991ef 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -46,7 +46,7 @@ static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
}
static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
struct inode *inode;
ino_t ino;
@@ -60,7 +60,7 @@ static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
}
static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ bool excl)
{
struct inode *inode = exofs_new_inode(dir, mode);
int err = PTR_ERR(inode);
@@ -143,9 +143,6 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
{
struct inode *inode = old_dentry->d_inode;
- if (inode->i_nlink >= EXOFS_LINK_MAX)
- return -EMLINK;
-
inode->i_ctime = CURRENT_TIME;
inode_inc_link_count(inode);
ihold(inode);
@@ -156,10 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct inode *inode;
- int err = -EMLINK;
-
- if (dir->i_nlink >= EXOFS_LINK_MAX)
- goto out;
+ int err;
inode_inc_link_count(dir);
@@ -275,11 +269,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (err)
goto out_dir;
} else {
- if (dir_de) {
- err = -EMLINK;
- if (new_dir->i_nlink >= EXOFS_LINK_MAX)
- goto out_dir;
- }
err = exofs_add_link(new_dentry, old_inode);
if (err)
goto out_dir;
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 49cf230554a..cfc0205d62c 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -58,9 +58,12 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
layout->parity = 1;
break;
case PNFS_OSD_RAID_PQ:
+ layout->parity = 2;
+ break;
case PNFS_OSD_RAID_4:
default:
- ORE_ERR("Only RAID_0/5 for now\n");
+ ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n",
+ layout->raid_algorithm);
return -EINVAL;
}
if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
@@ -103,7 +106,7 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
layout->max_io_length =
(BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
- layout->group_width;
+ (layout->group_width - layout->parity);
if (layout->parity) {
unsigned stripe_length =
(layout->group_width - layout->parity) *
@@ -112,6 +115,8 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
layout->max_io_length /= stripe_length;
layout->max_io_length *= stripe_length;
}
+ ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length);
+
return 0;
}
EXPORT_SYMBOL(ore_verify_layout);
@@ -286,7 +291,8 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
if (length) {
ore_calc_stripe_info(layout, offset, length, &ios->si);
ios->length = ios->si.length;
- ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
+ ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) +
+ ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
if (layout->parity)
_ore_post_alloc_raid_stuff(ios);
}
@@ -401,7 +407,7 @@ static void _clear_bio(struct bio *bio)
struct bio_vec *bv;
unsigned i;
- __bio_for_each_segment(bv, bio, i, 0) {
+ bio_for_each_segment_all(bv, bio, i) {
unsigned this_count = bv->bv_len;
if (likely(PAGE_SIZE == this_count))
@@ -430,8 +436,12 @@ int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
if (likely(!ret))
continue;
- if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
- /* start read offset passed endof file */
+ if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) &&
+ per_dev->bio) {
+ /* start read offset passed endof file.
+ * Note: if we do not have bio it means read-attributes
+ * In this case we should return error to caller.
+ */
_clear_bio(per_dev->bio);
ORE_DBGMSG("start read offset passed end of file "
"offset=0x%llx, length=0x%llx\n",
@@ -536,24 +546,28 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
u64 H = LmodS - G * T;
u32 N = div_u64(H, U);
+ u32 Nlast;
/* "H - (N * U)" is just "H % U" so it's bound to u32 */
u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
+ u32 first_dev = C - C % group_width;
div_u64_rem(file_offset, stripe_unit, &si->unit_off);
si->obj_offset = si->unit_off + (N * stripe_unit) +
(M * group_depth * stripe_unit);
+ si->cur_comp = C - first_dev;
+ si->cur_pg = si->unit_off / PAGE_SIZE;
if (parity) {
u32 LCMdP = lcm(group_width, parity) / parity;
/* R = N % LCMdP; */
u32 RxP = (N % LCMdP) * parity;
- u32 first_dev = C - C % group_width;
si->par_dev = (group_width + group_width - parity - RxP) %
group_width + first_dev;
- si->dev = (group_width + C - RxP) % group_width + first_dev;
+ si->dev = (group_width + group_width + C - RxP) %
+ group_width + first_dev;
si->bytes_in_stripe = U;
si->first_stripe_start = M * S + G * T + N * U;
} else {
@@ -568,6 +582,10 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
si->length = T - H;
if (si->length > length)
si->length = length;
+
+ Nlast = div_u64(H + si->length + U - 1, U);
+ si->maxdevUnits = Nlast - N;
+
si->M = M;
}
EXPORT_SYMBOL(ore_calc_stripe_info);
@@ -583,13 +601,16 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
int ret;
if (per_dev->bio == NULL) {
- unsigned pages_in_stripe = ios->layout->group_width *
- (ios->layout->stripe_unit / PAGE_SIZE);
- unsigned nr_pages = ios->nr_pages * ios->layout->group_width /
- (ios->layout->group_width -
- ios->layout->parity);
- unsigned bio_size = (nr_pages + pages_in_stripe) /
- ios->layout->group_width;
+ unsigned bio_size;
+
+ if (!ios->reading) {
+ bio_size = ios->si.maxdevUnits;
+ } else {
+ bio_size = (ios->si.maxdevUnits + 1) *
+ (ios->layout->group_width - ios->layout->parity) /
+ ios->layout->group_width;
+ }
+ bio_size *= (ios->layout->stripe_unit / PAGE_SIZE);
per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
if (unlikely(!per_dev->bio)) {
@@ -609,8 +630,12 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
pglen, pgbase);
if (unlikely(pglen != added_len)) {
- ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n",
- per_dev->bio->bi_vcnt);
+ /* If bi_vcnt == bi_max then this is a SW BUG */
+ ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x "
+ "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n",
+ per_dev->bio->bi_vcnt,
+ per_dev->bio->bi_max_vecs,
+ BIO_MAX_PAGES_KMALLOC, cur_len);
ret = -ENOMEM;
goto out;
}
@@ -632,6 +657,43 @@ out: /* we fail the complete unit on an error eg don't advance
return ret;
}
+static int _add_parity_units(struct ore_io_state *ios,
+ struct ore_striping_info *si,
+ unsigned dev, unsigned first_dev,
+ unsigned mirrors_p1, unsigned devs_in_group,
+ unsigned cur_len)
+{
+ unsigned do_parity;
+ int ret = 0;
+
+ for (do_parity = ios->layout->parity; do_parity; --do_parity) {
+ struct ore_per_dev_state *per_dev;
+
+ per_dev = &ios->per_dev[dev - first_dev];
+ if (!per_dev->length && !per_dev->offset) {
+ /* Only/always the parity unit of the first
+ * stripe will be empty. So this is a chance to
+ * initialize the per_dev info.
+ */
+ per_dev->dev = dev;
+ per_dev->offset = si->obj_offset - si->unit_off;
+ }
+
+ ret = _ore_add_parity_unit(ios, si, per_dev, cur_len,
+ do_parity == 1);
+ if (unlikely(ret))
+ break;
+
+ if (do_parity != 1) {
+ dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
+ si->cur_comp = (si->cur_comp + 1) %
+ ios->layout->group_width;
+ }
+ }
+
+ return ret;
+}
+
static int _prepare_for_striping(struct ore_io_state *ios)
{
struct ore_striping_info *si = &ios->si;
@@ -641,7 +703,6 @@ static int _prepare_for_striping(struct ore_io_state *ios)
unsigned devs_in_group = group_width * mirrors_p1;
unsigned dev = si->dev;
unsigned first_dev = dev - (dev % devs_in_group);
- unsigned dev_order;
unsigned cur_pg = ios->pages_consumed;
u64 length = ios->length;
int ret = 0;
@@ -653,16 +714,13 @@ static int _prepare_for_striping(struct ore_io_state *ios)
BUG_ON(length > si->length);
- dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev);
- si->cur_comp = dev_order;
- si->cur_pg = si->unit_off / PAGE_SIZE;
-
while (length) {
- unsigned comp = dev - first_dev;
- struct ore_per_dev_state *per_dev = &ios->per_dev[comp];
+ struct ore_per_dev_state *per_dev =
+ &ios->per_dev[dev - first_dev];
unsigned cur_len, page_off = 0;
- if (!per_dev->length) {
+ if (!per_dev->length && !per_dev->offset) {
+ /* First time initialize the per_dev info. */
per_dev->dev = dev;
if (dev == si->dev) {
WARN_ON(dev == si->par_dev);
@@ -671,13 +729,7 @@ static int _prepare_for_striping(struct ore_io_state *ios)
page_off = si->unit_off & ~PAGE_MASK;
BUG_ON(page_off && (page_off != ios->pgbase));
} else {
- if (si->cur_comp > dev_order)
- per_dev->offset =
- si->obj_offset - si->unit_off;
- else /* si->cur_comp < dev_order */
- per_dev->offset =
- si->obj_offset + stripe_unit -
- si->unit_off;
+ per_dev->offset = si->obj_offset - si->unit_off;
cur_len = stripe_unit;
}
} else {
@@ -691,11 +743,9 @@ static int _prepare_for_striping(struct ore_io_state *ios)
if (unlikely(ret))
goto out;
- dev += mirrors_p1;
- dev = (dev % devs_in_group) + first_dev;
-
length -= cur_len;
+ dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
si->cur_comp = (si->cur_comp + 1) % group_width;
if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
if (!length && ios->sp2d) {
@@ -703,23 +753,16 @@ static int _prepare_for_striping(struct ore_io_state *ios)
* stripe. then operate on parity dev.
*/
dev = si->par_dev;
- }
- if (ios->sp2d)
- /* In writes cur_len just means if it's the
- * last one. See _ore_add_parity_unit.
- */
- cur_len = length;
- per_dev = &ios->per_dev[dev - first_dev];
- if (!per_dev->length) {
- /* Only/always the parity unit of the first
- * stripe will be empty. So this is a chance to
- * initialize the per_dev info.
- */
- per_dev->dev = dev;
- per_dev->offset = si->obj_offset - si->unit_off;
+ /* If last stripe operate on parity comp */
+ si->cur_comp = group_width - ios->layout->parity;
}
- ret = _ore_add_parity_unit(ios, si, per_dev, cur_len);
+ /* In writes cur_len just means if it's the
+ * last one. See _ore_add_parity_unit.
+ */
+ ret = _add_parity_units(ios, si, dev, first_dev,
+ mirrors_p1, devs_in_group,
+ ios->sp2d ? length : cur_len);
if (unlikely(ret))
goto out;
@@ -730,18 +773,14 @@ static int _prepare_for_striping(struct ore_io_state *ios)
/* Next stripe, start fresh */
si->cur_comp = 0;
si->cur_pg = 0;
+ si->obj_offset += cur_len;
+ si->unit_off = 0;
}
}
out:
ios->numdevs = devs_in_group;
ios->pages_consumed = cur_pg;
- if (unlikely(ret)) {
- if (length == ios->length)
- return ret;
- else
- ios->length -= length;
- }
- return 0;
+ return ret;
}
int ore_create(struct ore_io_state *ios)
@@ -820,8 +859,8 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
struct bio *bio;
if (per_dev != master_dev) {
- bio = bio_kmalloc(GFP_KERNEL,
- master_dev->bio->bi_max_vecs);
+ bio = bio_clone_kmalloc(master_dev->bio,
+ GFP_KERNEL);
if (unlikely(!bio)) {
ORE_DBGMSG(
"Failed to allocate BIO size=%u\n",
@@ -830,7 +869,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
goto out;
}
- __bio_clone(bio, master_dev->bio);
bio->bi_bdev = NULL;
bio->bi_next = NULL;
per_dev->offset = master_dev->offset;
@@ -843,11 +881,11 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
bio->bi_rw |= REQ_WRITE;
}
- osd_req_write(or, _ios_obj(ios, dev), per_dev->offset,
- bio, per_dev->length);
+ osd_req_write(or, _ios_obj(ios, cur_comp),
+ per_dev->offset, bio, per_dev->length);
ORE_DBGMSG("write(0x%llx) offset=0x%llx "
"length=0x%llx dev=%d\n",
- _LLU(_ios_obj(ios, dev)->id),
+ _LLU(_ios_obj(ios, cur_comp)->id),
_LLU(per_dev->offset),
_LLU(per_dev->length), dev);
} else if (ios->kern_buff) {
@@ -859,20 +897,20 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
(ios->si.unit_off + ios->length >
ios->layout->stripe_unit));
- ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev),
+ ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp),
per_dev->offset,
ios->kern_buff, ios->length);
if (unlikely(ret))
goto out;
ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
"length=0x%llx dev=%d\n",
- _LLU(_ios_obj(ios, dev)->id),
+ _LLU(_ios_obj(ios, cur_comp)->id),
_LLU(per_dev->offset),
_LLU(ios->length), per_dev->dev);
} else {
- osd_req_set_attributes(or, _ios_obj(ios, dev));
+ osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
- _LLU(_ios_obj(ios, dev)->id),
+ _LLU(_ios_obj(ios, cur_comp)->id),
ios->out_attr_len, dev);
}
@@ -1105,7 +1143,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
size_attr->attr = g_attr_logical_length;
size_attr->attr.val_ptr = &size_attr->newsize;
- ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
+ ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
_LLU(oc->comps->obj.id), _LLU(obj_size), i);
ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
&size_attr->attr);
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index d222c77cfa1..7f20f25c232 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -21,12 +21,12 @@
#undef ORE_DBGMSG2
#define ORE_DBGMSG2 ORE_DBGMSG
-struct page *_raid_page_alloc(void)
+static struct page *_raid_page_alloc(void)
{
return alloc_page(GFP_KERNEL);
}
-void _raid_page_free(struct page *p)
+static void _raid_page_free(struct page *p)
{
__free_page(p);
}
@@ -144,26 +144,26 @@ static void _sp2d_reset(struct __stripe_pages_2d *sp2d,
{
unsigned data_devs = sp2d->data_devs;
unsigned group_width = data_devs + sp2d->parity;
- unsigned p;
+ int p, c;
if (!sp2d->needed)
return;
- for (p = 0; p < sp2d->pages_in_unit; p++) {
- struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
- if (_1ps->write_count < group_width) {
- unsigned c;
+ for (c = data_devs - 1; c >= 0; --c)
+ for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
- for (c = 0; c < data_devs; c++)
- if (_1ps->page_is_read[c]) {
- struct page *page = _1ps->pages[c];
+ if (_1ps->page_is_read[c]) {
+ struct page *page = _1ps->pages[c];
- r4w->put_page(priv, page);
- _1ps->page_is_read[c] = false;
- }
+ r4w->put_page(priv, page);
+ _1ps->page_is_read[c] = false;
+ }
}
+ for (p = 0; p < sp2d->pages_in_unit; p++) {
+ struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages));
_1ps->write_count = 0;
_1ps->tx = NULL;
@@ -203,7 +203,7 @@ static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d)
static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
{
- unsigned p;
+ int p;
for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
@@ -218,22 +218,28 @@ static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
{
unsigned p;
+ unsigned tx_flags = ASYNC_TX_ACK;
+
+ if (sp2d->parity == 1)
+ tx_flags |= ASYNC_TX_XOR_ZERO_DST;
+
for (p = 0; p < sp2d->pages_in_unit; p++) {
struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
if (!_1ps->write_count)
continue;
- init_async_submit(&_1ps->submit,
- ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK,
- NULL,
- NULL, NULL,
- (addr_conv_t *)_1ps->scribble);
-
- /* TODO: raid6 */
- _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages,
- 0, sp2d->data_devs, PAGE_SIZE,
- &_1ps->submit);
+ init_async_submit(&_1ps->submit, tx_flags,
+ NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble);
+
+ if (sp2d->parity == 1)
+ _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs],
+ _1ps->pages, 0, sp2d->data_devs,
+ PAGE_SIZE, &_1ps->submit);
+ else /* parity == 2 */
+ _1ps->tx = async_gen_syndrome(_1ps->pages, 0,
+ sp2d->data_devs + sp2d->parity,
+ PAGE_SIZE, &_1ps->submit);
}
for (p = 0; p < sp2d->pages_in_unit; p++) {
@@ -404,9 +410,8 @@ static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset)
ore_calc_stripe_info(ios->layout, *offset, 0, &si);
- p = si.unit_off / PAGE_SIZE;
- c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
- ios->layout->mirrors_p1, si.par_dev, si.dev);
+ p = si.cur_pg;
+ c = si.cur_comp;
page = ios->sp2d->_1p_stripes[p].pages[c];
pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE);
@@ -432,7 +437,7 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
if (!bio)
continue;
- __bio_for_each_segment(bv, bio, i, 0) {
+ bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
SetPageUptodate(page);
@@ -461,16 +466,12 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
* ios->sp2d[p][*], xor is calculated the same way. These pages are
* allocated/freed and don't go through cache
*/
-static int _read_4_write(struct ore_io_state *ios)
+static int _read_4_write_first_stripe(struct ore_io_state *ios)
{
- struct ore_io_state *ios_read;
struct ore_striping_info read_si;
struct __stripe_pages_2d *sp2d = ios->sp2d;
u64 offset = ios->si.first_stripe_start;
- u64 last_stripe_end;
- unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
- unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1;
- int ret;
+ unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
if (offset == ios->offset) /* Go to start collect $200 */
goto read_last_stripe;
@@ -478,6 +479,9 @@ static int _read_4_write(struct ore_io_state *ios)
min_p = _sp2d_min_pg(sp2d);
max_p = _sp2d_max_pg(sp2d);
+ ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n",
+ offset, ios->offset, min_p, max_p);
+
for (c = 0; ; c++) {
ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
read_si.obj_offset += min_p * PAGE_SIZE;
@@ -512,6 +516,18 @@ static int _read_4_write(struct ore_io_state *ios)
}
read_last_stripe:
+ return 0;
+}
+
+static int _read_4_write_last_stripe(struct ore_io_state *ios)
+{
+ struct ore_striping_info read_si;
+ struct __stripe_pages_2d *sp2d = ios->sp2d;
+ u64 offset;
+ u64 last_stripe_end;
+ unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
+ unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
+
offset = ios->offset + ios->length;
if (offset % PAGE_SIZE)
_add_to_r4w_last_page(ios, &offset);
@@ -523,12 +539,8 @@ read_last_stripe:
goto read_it;
ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
- p = read_si.unit_off / PAGE_SIZE;
- c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
- ios->layout->mirrors_p1, read_si.par_dev, read_si.dev);
-
- BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end);
- /* unaligned IO must be within a single stripe */
+ p = read_si.cur_pg;
+ c = read_si.cur_comp;
if (min_p == sp2d->pages_in_unit) {
/* Didn't do it yet */
@@ -536,6 +548,9 @@ read_last_stripe:
max_p = _sp2d_max_pg(sp2d);
}
+ ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n",
+ offset, last_stripe_end, min_p, max_p);
+
while (offset < last_stripe_end) {
struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
@@ -568,6 +583,15 @@ read_last_stripe:
}
read_it:
+ return 0;
+}
+
+static int _read_4_write_execute(struct ore_io_state *ios)
+{
+ struct ore_io_state *ios_read;
+ unsigned i;
+ int ret;
+
ios_read = ios->ios_read_4_write;
if (!ios_read)
return 0;
@@ -591,6 +615,8 @@ read_it:
}
_mark_read4write_pages_uptodate(ios_read, ret);
+ ore_put_io_state(ios_read);
+ ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */
return 0;
}
@@ -598,7 +624,7 @@ read_it:
int _ore_add_parity_unit(struct ore_io_state *ios,
struct ore_striping_info *si,
struct ore_per_dev_state *per_dev,
- unsigned cur_len)
+ unsigned cur_len, bool do_xor)
{
if (ios->reading) {
if (per_dev->cur_sg >= ios->sgs_per_dev) {
@@ -618,16 +644,18 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
si->cur_pg = _sp2d_min_pg(sp2d);
num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
- if (!cur_len) /* If last stripe operate on parity comp */
- si->cur_comp = sp2d->data_devs;
-
if (!per_dev->length) {
per_dev->offset += si->cur_pg * PAGE_SIZE;
/* If first stripe, Read in all read4write pages
* (if needed) before we calculate the first parity.
*/
- _read_4_write(ios);
+ if (do_xor)
+ _read_4_write_first_stripe(ios);
}
+ if (!cur_len && do_xor)
+ /* If last stripe r4w pages of last stripe */
+ _read_4_write_last_stripe(ios);
+ _read_4_write_execute(ios);
for (i = 0; i < num_pages; i++) {
pages[i] = _raid_page_alloc();
@@ -637,7 +665,7 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
++(ios->cur_par_page);
}
- BUG_ON(si->cur_comp != sp2d->data_devs);
+ BUG_ON(si->cur_comp < sp2d->data_devs);
BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
@@ -645,43 +673,24 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
if (unlikely(ret))
return ret;
- /* TODO: raid6 if (last_parity_dev) */
- _gen_xor_unit(sp2d);
- _sp2d_reset(sp2d, ios->r4w, ios->private);
+ if (do_xor) {
+ _gen_xor_unit(sp2d);
+ _sp2d_reset(sp2d, ios->r4w, ios->private);
+ }
}
return 0;
}
int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
{
- struct ore_layout *layout = ios->layout;
-
if (ios->parity_pages) {
+ struct ore_layout *layout = ios->layout;
unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
- unsigned stripe_size = ios->si.bytes_in_stripe;
- u64 last_stripe, first_stripe;
if (_sp2d_alloc(pages_in_unit, layout->group_width,
layout->parity, &ios->sp2d)) {
return -ENOMEM;
}
-
- /* Round io down to last full strip */
- first_stripe = div_u64(ios->offset, stripe_size);
- last_stripe = div_u64(ios->offset + ios->length, stripe_size);
-
- /* If an IO spans more then a single stripe it must end at
- * a stripe boundary. The reminder at the end is pushed into the
- * next IO.
- */
- if (last_stripe != first_stripe) {
- ios->length = last_stripe * stripe_size - ios->offset;
-
- BUG_ON(!ios->length);
- ios->nr_pages = (ios->length + PAGE_SIZE - 1) /
- PAGE_SIZE;
- ios->si.length = ios->length; /*make it consistent */
- }
}
return 0;
}
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
index 2ffd2c3c6e4..cf6375d8212 100644
--- a/fs/exofs/ore_raid.h
+++ b/fs/exofs/ore_raid.h
@@ -31,24 +31,6 @@
#define ORE_DBGMSG2(M...) do {} while (0)
/* #define ORE_DBGMSG2 ORE_DBGMSG */
-/* Calculate the component order in a stripe. eg the logical data unit
- * address within the stripe of @dev given the @par_dev of this stripe.
- */
-static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1,
- unsigned par_dev, unsigned dev)
-{
- unsigned first_dev = dev - dev % devs_in_group;
-
- dev -= first_dev;
- par_dev -= first_dev;
-
- if (devs_in_group == par_dev) /* The raid 0 case */
- return dev / mirrors_p1;
- /* raid4/5/6 case */
- return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) /
- mirrors_p1;
-}
-
/* ios_raid.c stuff needed by ios.c */
int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
void _ore_free_raid_stuff(struct ore_io_state *ios);
@@ -56,7 +38,8 @@ void _ore_free_raid_stuff(struct ore_io_state *ios);
void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
bool not_last);
int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
- struct ore_per_dev_state *per_dev, unsigned cur_len);
+ struct ore_per_dev_state *per_dev, unsigned cur_len,
+ bool do_xor);
void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
struct ore_striping_info *si, struct page *page);
static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index d22cd168c6e..ed73ed8ebbe 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -206,6 +206,11 @@ static int init_inodecache(void)
*/
static void destroy_inodecache(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(exofs_inode_cachep);
}
@@ -384,12 +389,10 @@ static int exofs_sync_fs(struct super_block *sb, int wait)
if (unlikely(ret))
goto out;
- lock_super(sb);
-
ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
memset(fscb, 0, ios->length);
fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
- fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
+ fscb->s_numfiles = cpu_to_le64(sbi->s_numfiles);
fscb->s_magic = cpu_to_le16(sb->s_magic);
fscb->s_newfs = 0;
fscb->s_version = EXOFS_FSCB_VER;
@@ -400,11 +403,7 @@ static int exofs_sync_fs(struct super_block *sb, int wait)
ret = ore_write(ios);
if (unlikely(ret))
EXOFS_ERR("%s: ore_write failed.\n", __func__);
- else
- sb->s_dirt = 0;
-
- unlock_super(sb);
out:
EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
ore_put_io_state(ios);
@@ -412,14 +411,6 @@ out:
return ret;
}
-static void exofs_write_super(struct super_block *sb)
-{
- if (!(sb->s_flags & MS_RDONLY))
- exofs_sync_fs(sb, 1);
- else
- sb->s_dirt = 0;
-}
-
static void _exofs_print_device(const char *msg, const char *dev_path,
struct osd_dev *od, u64 pid)
{
@@ -472,6 +463,7 @@ static void exofs_put_super(struct super_block *sb)
_exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0),
sbi->one_comp.obj.partition);
+ exofs_sysfs_sb_del(sbi);
bdi_destroy(&sbi->bdi);
exofs_free_sbi(sbi);
sb->s_fs_info = NULL;
@@ -529,7 +521,8 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
struct osd_dev_info *odi)
{
odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
- memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len);
+ if (likely(odi->systemid_len))
+ memcpy(odi->systemid, dt_dev->systemid, OSD_SYSTEMID_LEN);
odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
odi->osdname = dt_dev->osdname;
@@ -550,7 +543,7 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
return !(odi->systemid_len || odi->osdname_len);
}
-int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
+static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
struct exofs_dev **peds)
{
struct __alloc_ore_devs_and_exofs_devs {
@@ -565,7 +558,7 @@ int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
aoded = kzalloc(sizeof(*aoded), GFP_KERNEL);
if (unlikely(!aoded)) {
- EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
+ EXOFS_ERR("ERROR: failed allocating Device array[%d]\n",
numdevs);
return -ENOMEM;
}
@@ -631,6 +624,12 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0],
(numdevs - 1) * sizeof(sbi->oc.ods[0]));
+ /* create sysfs subdir under which we put the device table
+ * And cluster layout. A Superblock is identified by the string:
+ * "dev[0].osdname"_"pid"
+ */
+ exofs_sysfs_sb_add(sbi, &dt->dt_dev_table[0]);
+
for (i = 0; i < numdevs; i++) {
struct exofs_fscb fscb;
struct osd_dev_info odi;
@@ -656,6 +655,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
eds[i].ored.od = fscb_od;
++sbi->oc.numdevs;
fscb_od = NULL;
+ exofs_sysfs_odev_add(&eds[i], sbi);
continue;
}
@@ -681,6 +681,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
odi.osdname);
goto out;
}
+ exofs_sysfs_odev_add(&eds[i], sbi);
/* TODO: verify other information is correct and FS-uuid
* matches. Benny what did you say about device table
@@ -744,7 +745,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
sbi->one_comp.obj.partition = opts->pid;
sbi->one_comp.obj.id = 0;
exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
- sbi->oc.numdevs = 1;
sbi->oc.single_comp = EC_SINGLE_COMP;
sbi->oc.comps = &sbi->one_comp;
@@ -754,6 +754,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_blocksize = EXOFS_BLKSIZE;
sb->s_blocksize_bits = EXOFS_BLKSHIFT;
sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_max_links = EXOFS_LINK_MAX;
atomic_set(&sbi->s_curr_pending, 0);
sb->s_bdev = NULL;
sb->s_dev = 0;
@@ -802,6 +803,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
goto free_sbi;
ore_comp_set_dev(&sbi->oc, 0, od);
+ sbi->oc.numdevs = 1;
}
__sbi_read_stats(sbi);
@@ -818,9 +820,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
ret = PTR_ERR(root);
goto free_sbi;
}
- sb->s_root = d_alloc_root(root);
+ sb->s_root = d_make_root(root);
if (!sb->s_root) {
- iput(root);
EXOFS_ERR("ERROR: get root inode failed\n");
ret = -ENOMEM;
goto free_sbi;
@@ -843,6 +844,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
goto free_sbi;
}
+ exofs_sysfs_dbg_print();
_exofs_print_device("Mounting", opts->dev_name,
ore_comp_dev(&sbi->oc, 0),
sbi->one_comp.obj.partition);
@@ -941,7 +943,6 @@ static const struct super_operations exofs_sops = {
.write_inode = exofs_write_inode,
.evict_inode = exofs_evict_inode,
.put_super = exofs_put_super,
- .write_super = exofs_write_super,
.sync_fs = exofs_sync_fs,
.statfs = exofs_statfs,
};
@@ -1009,6 +1010,7 @@ static struct file_system_type exofs_type = {
.mount = exofs_mount,
.kill_sb = generic_shutdown_super,
};
+MODULE_ALIAS_FS("exofs");
static int __init init_exofs(void)
{
@@ -1022,6 +1024,9 @@ static int __init init_exofs(void)
if (err)
goto out_d;
+ /* We don't fail if sysfs creation failed */
+ exofs_sysfs_init();
+
return 0;
out_d:
destroy_inodecache();
@@ -1031,6 +1036,7 @@ out:
static void __exit exit_exofs(void)
{
+ exofs_sysfs_uninit();
unregister_filesystem(&exofs_type);
destroy_inodecache();
}
diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c
new file mode 100644
index 00000000000..1b4f2f95fc3
--- /dev/null
+++ b/fs/exofs/sys.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (C) 2012
+ * Sachin Bhamare <sbhamare@panasas.com>
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License 2 as published by
+ * the Free Software Foundation.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the:
+ * Free Software Foundation <licensing@fsf.org>
+ */
+
+#include <linux/kobject.h>
+#include <linux/device.h>
+
+#include "exofs.h"
+
+struct odev_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct exofs_dev *, char *);
+ ssize_t (*store)(struct exofs_dev *, const char *, size_t);
+};
+
+static ssize_t odev_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj);
+ struct odev_attr *a = container_of(attr, struct odev_attr, attr);
+
+ return a->show ? a->show(edp, buf) : 0;
+}
+
+static ssize_t odev_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj);
+ struct odev_attr *a = container_of(attr, struct odev_attr, attr);
+
+ return a->store ? a->store(edp, buf, len) : len;
+}
+
+static const struct sysfs_ops odev_attr_ops = {
+ .show = odev_attr_show,
+ .store = odev_attr_store,
+};
+
+
+static struct kset *exofs_kset;
+
+static ssize_t osdname_show(struct exofs_dev *edp, char *buf)
+{
+ struct osd_dev *odev = edp->ored.od;
+ const struct osd_dev_info *odi = osduld_device_info(odev);
+
+ return snprintf(buf, odi->osdname_len + 1, "%s", odi->osdname);
+}
+
+static ssize_t systemid_show(struct exofs_dev *edp, char *buf)
+{
+ struct osd_dev *odev = edp->ored.od;
+ const struct osd_dev_info *odi = osduld_device_info(odev);
+
+ memcpy(buf, odi->systemid, odi->systemid_len);
+ return odi->systemid_len;
+}
+
+static ssize_t uri_show(struct exofs_dev *edp, char *buf)
+{
+ return snprintf(buf, edp->urilen, "%s", edp->uri);
+}
+
+static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len)
+{
+ uint8_t *new_uri;
+
+ edp->urilen = strlen(buf) + 1;
+ new_uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL);
+ if (new_uri == NULL)
+ return -ENOMEM;
+ edp->uri = new_uri;
+ strncpy(edp->uri, buf, edp->urilen);
+ return edp->urilen;
+}
+
+#define OSD_ATTR(name, mode, show, store) \
+ static struct odev_attr odev_attr_##name = \
+ __ATTR(name, mode, show, store)
+
+OSD_ATTR(osdname, S_IRUGO, osdname_show, NULL);
+OSD_ATTR(systemid, S_IRUGO, systemid_show, NULL);
+OSD_ATTR(uri, S_IRWXU, uri_show, uri_store);
+
+static struct attribute *odev_attrs[] = {
+ &odev_attr_osdname.attr,
+ &odev_attr_systemid.attr,
+ &odev_attr_uri.attr,
+ NULL,
+};
+
+static struct kobj_type odev_ktype = {
+ .default_attrs = odev_attrs,
+ .sysfs_ops = &odev_attr_ops,
+};
+
+static struct kobj_type uuid_ktype = {
+};
+
+void exofs_sysfs_dbg_print(void)
+{
+#ifdef CONFIG_EXOFS_DEBUG
+ struct kobject *k_name, *k_tmp;
+
+ list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
+ printk(KERN_INFO "%s: name %s ref %d\n",
+ __func__, kobject_name(k_name),
+ (int)atomic_read(&k_name->kref.refcount));
+ }
+#endif
+}
+/*
+ * This function removes all kobjects under exofs_kset
+ * At the end of it, exofs_kset kobject will have a refcount
+ * of 1 which gets decremented only on exofs module unload
+ */
+void exofs_sysfs_sb_del(struct exofs_sb_info *sbi)
+{
+ struct kobject *k_name, *k_tmp;
+ struct kobject *s_kobj = &sbi->s_kobj;
+
+ list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
+ /* Remove all that are children of this SBI */
+ if (k_name->parent == s_kobj)
+ kobject_put(k_name);
+ }
+ kobject_put(s_kobj);
+}
+
+/*
+ * This function creates sysfs entries to hold the current exofs cluster
+ * instance (uniquely identified by osdname,pid tuple).
+ * This function gets called once per exofs mount instance.
+ */
+int exofs_sysfs_sb_add(struct exofs_sb_info *sbi,
+ struct exofs_dt_device_info *dt_dev)
+{
+ struct kobject *s_kobj;
+ int retval = 0;
+ uint64_t pid = sbi->one_comp.obj.partition;
+
+ /* allocate new uuid dirent */
+ s_kobj = &sbi->s_kobj;
+ s_kobj->kset = exofs_kset;
+ retval = kobject_init_and_add(s_kobj, &uuid_ktype,
+ &exofs_kset->kobj, "%s_%llx", dt_dev->osdname, pid);
+ if (retval) {
+ EXOFS_ERR("ERROR: Failed to create sysfs entry for "
+ "uuid-%s_%llx => %d\n", dt_dev->osdname, pid, retval);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+int exofs_sysfs_odev_add(struct exofs_dev *edev, struct exofs_sb_info *sbi)
+{
+ struct kobject *d_kobj;
+ int retval = 0;
+
+ /* create osd device group which contains following attributes
+ * osdname, systemid & uri
+ */
+ d_kobj = &edev->ed_kobj;
+ d_kobj->kset = exofs_kset;
+ retval = kobject_init_and_add(d_kobj, &odev_ktype,
+ &sbi->s_kobj, "dev%u", edev->did);
+ if (retval) {
+ EXOFS_ERR("ERROR: Failed to create sysfs entry for "
+ "device dev%u\n", edev->did);
+ return retval;
+ }
+ return 0;
+}
+
+int exofs_sysfs_init(void)
+{
+ exofs_kset = kset_create_and_add("exofs", NULL, fs_kobj);
+ if (!exofs_kset) {
+ EXOFS_ERR("ERROR: kset_create_and_add exofs failed\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void exofs_sysfs_uninit(void)
+{
+ kset_unregister(exofs_kset);
+}