12 files changed, 520 insertions, 260 deletions
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index 352ba149d23..389ba8312d5 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -16,5 +16,5 @@
 libore-y := ore.o ore_raid.o
 obj-$(CONFIG_ORE) += libore.o
 
-exofs-y := inode.o file.o symlink.o namei.o dir.o super.o
+exofs-y := inode.o file.o symlink.o namei.o dir.o super.o sys.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore
index 1ca7fb7b6ba..2daf2329c28 100644
--- a/fs/exofs/Kconfig.ore
+++ b/fs/exofs/Kconfig.ore
@@ -9,4 +9,6 @@ config ORE
 	tristate
 	depends on EXOFS_FS || PNFS_OBJLAYOUT
 	select ASYNC_XOR
+	select RAID6_PQ
+	select ASYNC_PQ
 	default SCSI_OSD_ULD
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 80405836ba6..49f51ab4caa 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -239,22 +239,19 @@ void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
 }
 
 static int
-exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+exofs_readdir(struct file *file, struct dir_context *ctx)
 {
-	loff_t pos = filp->f_pos;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	unsigned int offset = pos & ~PAGE_CACHE_MASK;
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 	unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
-	unsigned char *types = NULL;
-	int need_revalidate = (filp->f_version != inode->i_version);
+	int need_revalidate = (file->f_version != inode->i_version);
 
 	if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
 		return 0;
 
-	types = exofs_filetype_table;
-
 	for ( ; n < npages; n++, offset = 0) {
 		char *kaddr, *limit;
 		struct exofs_dir_entry *de;
@@ -263,7 +260,7 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (IS_ERR(page)) {
 			EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
 				  inode->i_ino);
-			filp->f_pos += PAGE_CACHE_SIZE - offset;
+			ctx->pos += PAGE_CACHE_SIZE - offset;
 			return PTR_ERR(page);
 		}
 		kaddr = page_address(page);
@@ -271,9 +268,9 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			if (offset) {
 				offset = exofs_validate_entry(kaddr, offset,
 								chunk_mask);
-				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+				ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
 			}
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 			need_revalidate = 0;
 		}
 		de = (struct exofs_dir_entry *)(kaddr + offset);
@@ -288,27 +285,24 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				return -EIO;
 			}
 			if (de->inode_no) {
-				int over;
-				unsigned char d_type = DT_UNKNOWN;
+				unsigned char t;
 
-				if (types && de->file_type < EXOFS_FT_MAX)
-					d_type = types[de->file_type];
+				if (de->file_type < EXOFS_FT_MAX)
+					t = exofs_filetype_table[de->file_type];
+				else
+					t = DT_UNKNOWN;
 
-				offset = (char *)de - kaddr;
-				over = filldir(dirent, de->name, de->name_len,
-						(n<<PAGE_CACHE_SHIFT) | offset,
+				if (!dir_emit(ctx, de->name, de->name_len,
 						le64_to_cpu(de->inode_no),
-						d_type);
-				if (over) {
+						t)) {
 					exofs_put_page(page);
 					return 0;
 				}
 			}
-			filp->f_pos += le16_to_cpu(de->rec_len);
+			ctx->pos += le16_to_cpu(de->rec_len);
 		}
 		exofs_put_page(page);
 	}
-
 	return 0;
 }
 
@@ -597,7 +591,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
 		goto fail;
 	}
 
-	kaddr = kmap_atomic(page, KM_USER0);
+	kaddr = kmap_atomic(page);
 	de = (struct exofs_dir_entry *)kaddr;
 	de->name_len = 1;
 	de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
@@ -611,7 +605,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
 	de->inode_no = cpu_to_le64(parent->i_ino);
 	memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
 	exofs_set_de_type(de, inode);
-	kunmap_atomic(kaddr, KM_USER0);
+	kunmap_atomic(kaddr);
 	err = exofs_commit_chunk(page, 0, chunk_size);
 fail:
 	page_cache_release(page);
@@ -669,5 +663,5 @@ not_empty:
 const struct file_operations exofs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= exofs_readdir,
+	.iterate	= exofs_readdir,
 };
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index ca9d49665ef..fffe86fd7a4 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -56,6 +56,9 @@
 struct exofs_dev {
 	struct ore_dev ored;
 	unsigned did;
+	unsigned urilen;
+	uint8_t *uri;
+	struct kobject ed_kobj;
 };
 /*
  * our extension to the in-memory superblock
@@ -73,6 +76,7 @@ struct exofs_sb_info {
 	struct ore_layout	layout;		/* Default files layout       */
 	struct ore_comp one_comp;		/* id & cred of partition id=0*/
 	struct ore_components oc;		/* comps for the partition    */
+	struct kobject	s_kobj;			/* holds per-sbi kobject      */
 };
 
 /*
@@ -176,6 +180,16 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
 			   const struct osd_obj_id *obj);
 int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
 
+/* sys.c                 */
+int exofs_sysfs_init(void);
+void exofs_sysfs_uninit(void);
+int exofs_sysfs_sb_add(struct exofs_sb_info *sbi,
+		       struct exofs_dt_device_info *dt_dev);
+void exofs_sysfs_sb_del(struct exofs_sb_info *sbi);
+int exofs_sysfs_odev_add(struct exofs_dev *edev,
+			 struct exofs_sb_info *sbi);
+void exofs_sysfs_dbg_print(void);
+
 /*********************
  * operation vectors *
  *********************/
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 491c6c078e7..71bf8e4fb5d 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -67,17 +67,17 @@ static int exofs_flush(struct file *file, fl_owner_t id)
 
 const struct file_operations exofs_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= do_sync_read,
-	.write		= do_sync_write,
-	.aio_read	= generic_file_aio_read,
-	.aio_write	= generic_file_aio_write,
+	.read		= new_sync_read,
+	.write		= new_sync_write,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
 	.mmap		= generic_file_mmap,
 	.open		= generic_file_open,
 	.release	= exofs_release_file,
 	.fsync		= exofs_file_fsync,
 	.flush		= exofs_flush,
 	.splice_read	= generic_file_splice_read,
-	.splice_write	= generic_file_splice_write,
+	.splice_write	= iter_file_splice_write,
 };
 
 const struct inode_operations exofs_file_inode_operations = {
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index ea5e1f97806..3f9cafd7393 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,15 +37,12 @@
 
 #define EXOFS_DBGMSG2(M...) do {} while (0)
 
-enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), };
-
 unsigned exofs_max_io_pages(struct ore_layout *layout,
 			    unsigned expected_pages)
 {
-	unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
+	unsigned pages = min_t(unsigned, expected_pages,
+			       layout->max_io_length / PAGE_SIZE);
 
-	/* TODO: easily support bio chaining */
-	pages =  min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE);
 	return pages;
 }
 
@@ -101,7 +98,8 @@ static void _pcol_reset(struct page_collect *pcol)
 	 * it might not end here. don't be left with nothing
 	 */
 	if (!pcol->expected_pages)
-		pcol->expected_pages = MAX_PAGES_KMALLOC;
+		pcol->expected_pages =
+				exofs_max_io_pages(&pcol->sbi->layout, ~0);
 }
 
 static int pcol_try_alloc(struct page_collect *pcol)
@@ -363,12 +361,12 @@ static int read_exec(struct page_collect *pcol)
 	return 0;
 
 err:
-	if (!pcol->read_4_write)
-		_unlock_pcol_pages(pcol, ret, READ);
-
-	pcol_free(pcol);
-
+	if (!pcol_copy) /* Failed before ownership transfer */
+		pcol_copy = pcol;
+	_unlock_pcol_pages(pcol_copy, ret, READ);
+	pcol_free(pcol_copy);
 	kfree(pcol_copy);
+
 	return ret;
 }
 
@@ -389,6 +387,8 @@ static int readpage_strip(void *data, struct page *page)
 	size_t len;
 	int ret;
 
+	BUG_ON(!PageLocked(page));
+
 	/* FIXME: Just for debugging, will be removed */
 	if (PageUptodate(page))
 		EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
@@ -572,8 +572,16 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
 
 	if (!pcol->that_locked_page ||
 	    (pcol->that_locked_page->index != index)) {
-		struct page *page = find_get_page(pcol->inode->i_mapping, index);
+		struct page *page;
+		loff_t i_size = i_size_read(pcol->inode);
 
+		if (offset >= i_size) {
+			*uptodate = true;
+			EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index);
+			return ZERO_PAGE(0);
+		}
+
+		page =  find_get_page(pcol->inode->i_mapping, index);
 		if (!page) {
 			page = find_or_create_page(pcol->inode->i_mapping,
 						   index, GFP_NOFS);
@@ -588,10 +596,10 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
 			*uptodate = true;
 		else
 			*uptodate = PageUptodate(page);
-		EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate);
+		EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate);
 		return page;
 	} else {
-		EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n",
+		EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n",
 			     pcol->that_locked_page->index);
 		*uptodate = true;
 		return pcol->that_locked_page;
@@ -602,12 +610,13 @@ static void __r4w_put_page(void *priv, struct page *page)
 {
 	struct page_collect *pcol = priv;
 
-	if (pcol->that_locked_page != page) {
-		EXOFS_DBGMSG("index=0x%lx\n", page->index);
+	if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
+		EXOFS_DBGMSG2("index=0x%lx\n", page->index);
 		page_cache_release(page);
 		return;
 	}
-	EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index);
+	EXOFS_DBGMSG2("that_locked_page index=0x%lx\n",
+		     ZERO_PAGE(0) == page ? -1 : page->index);
 }
 
 static const struct _ore_r4w_op _r4w_op = {
@@ -667,8 +676,10 @@ static int write_exec(struct page_collect *pcol)
 	return 0;
 
 err:
-	_unlock_pcol_pages(pcol, ret, WRITE);
-	pcol_free(pcol);
+	if (!pcol_copy) /* Failed before ownership transfer */
+		pcol_copy = pcol;
+	_unlock_pcol_pages(pcol_copy, ret, WRITE);
+	pcol_free(pcol_copy);
 	kfree(pcol_copy);
 
 	return ret;
@@ -850,7 +861,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
 static void _write_failed(struct inode *inode, loff_t to)
 {
 	if (to > inode->i_size)
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 }
 
 int exofs_write_begin(struct file *file, struct address_space *mapping,
@@ -942,12 +953,22 @@ static int exofs_releasepage(struct page *page, gfp_t gfp)
 	return 0;
 }
 
-static void exofs_invalidatepage(struct page *page, unsigned long offset)
+static void exofs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
-	EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
+	EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
+		     page->index, offset, length);
 	WARN_ON(1);
 }
 
+
+ /* TODO: Should be easy enough to do proprly */
+static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb,
+		struct iov_iter *iter, loff_t offset)
+{
+	return 0;
+}
+
 const struct address_space_operations exofs_aops = {
 	.readpage	= exofs_readpage,
 	.readpages	= exofs_readpages,
@@ -961,7 +982,7 @@ const struct address_space_operations exofs_aops = {
 
 	/* Not implemented Yet */
 	.bmap		= NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
-	.direct_IO	= NULL, /* TODO: Should be trivial to do */
+	.direct_IO	= exofs_direct_IO,
 
 	/* With these NULL has special meaning or default is not exported */
 	.get_xip_mem	= NULL,
@@ -997,7 +1018,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize)
 	if (likely(!ret))
 		truncate_setsize(inode, newsize);
 
-	EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n",
+	EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n",
 		     inode->i_ino, newsize, ret);
 	return ret;
 }
@@ -1081,14 +1102,13 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 		/* If object is lost on target we might as well enable it's
 		 * delete.
 		 */
-		if ((ret == -ENOENT) || (ret == -EINVAL))
-			ret = 0;
+		ret = 0;
 		goto out;
 	}
 
 	ret = extract_attr_from_ios(ios, &attrs[0]);
 	if (ret) {
-		EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+		EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__);
 		goto out;
 	}
 	WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
@@ -1096,7 +1116,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 
 	ret = extract_attr_from_ios(ios, &attrs[1]);
 	if (ret) {
-		EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+		EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__);
 		goto out;
 	}
 	if (attrs[1].len) {
@@ -1111,7 +1131,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 
 	ret = extract_attr_from_ios(ios, &attrs[2]);
 	if (ret) {
-		EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+		EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__);
 		goto out;
 	}
 	if (attrs[2].len) {
@@ -1163,8 +1183,8 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 
 	/* copy stuff from on-disk struct to in-memory struct */
 	inode->i_mode = le16_to_cpu(fcb.i_mode);
-	inode->i_uid = le32_to_cpu(fcb.i_uid);
-	inode->i_gid = le32_to_cpu(fcb.i_gid);
+	i_uid_write(inode, le32_to_cpu(fcb.i_uid));
+	i_gid_write(inode, le32_to_cpu(fcb.i_gid));
 	set_nlink(inode, le16_to_cpu(fcb.i_links_count));
 	inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
 	inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
@@ -1376,8 +1396,8 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	fcb = &args->fcb;
 
 	fcb->i_mode = cpu_to_le16(inode->i_mode);
-	fcb->i_uid = cpu_to_le32(inode->i_uid);
-	fcb->i_gid = cpu_to_le32(inode->i_gid);
+	fcb->i_uid = cpu_to_le32(i_uid_read(inode));
+	fcb->i_gid = cpu_to_le32(i_gid_read(inode));
 	fcb->i_links_count = cpu_to_le16(inode->i_nlink);
 	fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
 	fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
@@ -1466,14 +1486,14 @@ void exofs_evict_inode(struct inode *inode)
 	struct ore_io_state *ios;
 	int ret;
 
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 
 	/* TODO: should do better here */
 	if (inode->i_nlink || is_bad_inode(inode))
 		goto no_delete;
 
 	inode->i_size = 0;
-	end_writeback(inode);
+	clear_inode(inode);
 
 	/* if we are deleting an obj that hasn't been created yet, wait.
 	 * This also makes sure that create_done cannot be called with an
@@ -1503,5 +1523,5 @@ void exofs_evict_inode(struct inode *inode)
 	return;
 
 no_delete:
-	end_writeback(inode);
+	clear_inode(inode);
 }
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 9dbf0c30103..4731fd991ef 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -46,7 +46,7 @@ static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
 }
 
 static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
-				   struct nameidata *nd)
+				   unsigned int flags)
 {
 	struct inode *inode;
 	ino_t ino;
@@ -60,7 +60,7 @@ static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
 }
 
 static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-			 struct nameidata *nd)
+			 bool excl)
 {
 	struct inode *inode = exofs_new_inode(dir, mode);
 	int err = PTR_ERR(inode);
@@ -143,9 +143,6 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
 {
 	struct inode *inode = old_dentry->d_inode;
 
-	if (inode->i_nlink >= EXOFS_LINK_MAX)
-		return -EMLINK;
-
 	inode->i_ctime = CURRENT_TIME;
 	inode_inc_link_count(inode);
 	ihold(inode);
@@ -156,10 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
 static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
-	int err = -EMLINK;
-
-	if (dir->i_nlink >= EXOFS_LINK_MAX)
-		goto out;
+	int err;
 
 	inode_inc_link_count(dir);
 
@@ -275,11 +269,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		if (err)
 			goto out_dir;
 	} else {
-		if (dir_de) {
-			err = -EMLINK;
-			if (new_dir->i_nlink >= EXOFS_LINK_MAX)
-				goto out_dir;
-		}
 		err = exofs_add_link(new_dentry, old_inode);
 		if (err)
 			goto out_dir;
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 49cf230554a..cfc0205d62c 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -58,9 +58,12 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
 		layout->parity = 1;
 		break;
 	case PNFS_OSD_RAID_PQ:
+		layout->parity = 2;
+		break;
 	case PNFS_OSD_RAID_4:
 	default:
-		ORE_ERR("Only RAID_0/5 for now\n");
+		ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n",
+			layout->raid_algorithm);
 		return -EINVAL;
 	}
 	if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
@@ -103,7 +106,7 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
 
 	layout->max_io_length =
 		(BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
-							layout->group_width;
+					(layout->group_width - layout->parity);
 	if (layout->parity) {
 		unsigned stripe_length =
 				(layout->group_width - layout->parity) *
@@ -112,6 +115,8 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
 		layout->max_io_length /= stripe_length;
 		layout->max_io_length *= stripe_length;
 	}
+	ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length);
+
 	return 0;
 }
 EXPORT_SYMBOL(ore_verify_layout);
@@ -286,7 +291,8 @@ int  ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
 	if (length) {
 		ore_calc_stripe_info(layout, offset, length, &ios->si);
 		ios->length = ios->si.length;
-		ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
+		ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) +
+				 ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
 		if (layout->parity)
 			_ore_post_alloc_raid_stuff(ios);
 	}
@@ -401,7 +407,7 @@ static void _clear_bio(struct bio *bio)
 	struct bio_vec *bv;
 	unsigned i;
 
-	__bio_for_each_segment(bv, bio, i, 0) {
+	bio_for_each_segment_all(bv, bio, i) {
 		unsigned this_count = bv->bv_len;
 
 		if (likely(PAGE_SIZE == this_count))
@@ -430,8 +436,12 @@ int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
 		if (likely(!ret))
 			continue;
 
-		if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
-			/* start read offset passed endof file */
+		if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) &&
+		    per_dev->bio) {
+			/* start read offset passed endof file.
+			 * Note: if we do not have bio it means read-attributes
+			 * In this case we should return error to caller.
+			 */
 			_clear_bio(per_dev->bio);
 			ORE_DBGMSG("start read offset passed end of file "
 				"offset=0x%llx, length=0x%llx\n",
@@ -536,24 +546,28 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
 	u64	H = LmodS - G * T;
 
 	u32	N = div_u64(H, U);
+	u32	Nlast;
 
 	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
 	u32	C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
+	u32 first_dev = C - C % group_width;
 
 	div_u64_rem(file_offset, stripe_unit, &si->unit_off);
 
 	si->obj_offset = si->unit_off + (N * stripe_unit) +
 				  (M * group_depth * stripe_unit);
+	si->cur_comp = C - first_dev;
+	si->cur_pg = si->unit_off / PAGE_SIZE;
 
 	if (parity) {
 		u32 LCMdP = lcm(group_width, parity) / parity;
 		/* R     = N % LCMdP; */
 		u32 RxP   = (N % LCMdP) * parity;
-		u32 first_dev = C - C % group_width;
 
 		si->par_dev = (group_width + group_width - parity - RxP) %
 			      group_width + first_dev;
-		si->dev = (group_width + C - RxP) % group_width + first_dev;
+		si->dev = (group_width + group_width + C - RxP) %
+			  group_width + first_dev;
 		si->bytes_in_stripe = U;
 		si->first_stripe_start = M * S + G * T + N * U;
 	} else {
@@ -568,6 +582,10 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
 	si->length = T - H;
 	if (si->length > length)
 		si->length = length;
+
+	Nlast = div_u64(H + si->length + U - 1, U);
+	si->maxdevUnits = Nlast - N;
+
 	si->M = M;
 }
 EXPORT_SYMBOL(ore_calc_stripe_info);
@@ -583,13 +601,16 @@ int _ore_add_stripe_unit(struct ore_io_state *ios,  unsigned *cur_pg,
 	int ret;
 
 	if (per_dev->bio == NULL) {
-		unsigned pages_in_stripe = ios->layout->group_width *
-					(ios->layout->stripe_unit / PAGE_SIZE);
-		unsigned nr_pages = ios->nr_pages * ios->layout->group_width /
-					(ios->layout->group_width -
-					 ios->layout->parity);
-		unsigned bio_size = (nr_pages + pages_in_stripe) /
-					ios->layout->group_width;
+		unsigned bio_size;
+
+		if (!ios->reading) {
+			bio_size = ios->si.maxdevUnits;
+		} else {
+			bio_size = (ios->si.maxdevUnits + 1) *
+			     (ios->layout->group_width - ios->layout->parity) /
+			     ios->layout->group_width;
+		}
+		bio_size *= (ios->layout->stripe_unit / PAGE_SIZE);
 
 		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
 		if (unlikely(!per_dev->bio)) {
@@ -609,8 +630,12 @@ int _ore_add_stripe_unit(struct ore_io_state *ios,  unsigned *cur_pg,
 		added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
 					    pglen, pgbase);
 		if (unlikely(pglen != added_len)) {
-			ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n",
-				   per_dev->bio->bi_vcnt);
+			/* If bi_vcnt == bi_max then this is a SW BUG */
+			ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x "
+				   "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n",
+				   per_dev->bio->bi_vcnt,
+				   per_dev->bio->bi_max_vecs,
+				   BIO_MAX_PAGES_KMALLOC, cur_len);
 			ret = -ENOMEM;
 			goto out;
 		}
@@ -632,6 +657,43 @@ out:	/* we fail the complete unit on an error eg don't advance
 	return ret;
 }
 
+static int _add_parity_units(struct ore_io_state *ios,
+			     struct ore_striping_info *si,
+			     unsigned dev, unsigned first_dev,
+			     unsigned mirrors_p1, unsigned devs_in_group,
+			     unsigned cur_len)
+{
+	unsigned do_parity;
+	int ret = 0;
+
+	for (do_parity = ios->layout->parity; do_parity; --do_parity) {
+		struct ore_per_dev_state *per_dev;
+
+		per_dev = &ios->per_dev[dev - first_dev];
+		if (!per_dev->length && !per_dev->offset) {
+			/* Only/always the parity unit of the first
+			 * stripe will be empty. So this is a chance to
+			 * initialize the per_dev info.
+			 */
+			per_dev->dev = dev;
+			per_dev->offset = si->obj_offset - si->unit_off;
+		}
+
+		ret = _ore_add_parity_unit(ios, si, per_dev, cur_len,
+					   do_parity == 1);
+		if (unlikely(ret))
+				break;
+
+		if (do_parity != 1) {
+			dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
+			si->cur_comp = (si->cur_comp + 1) %
+						       ios->layout->group_width;
+		}
+	}
+
+	return ret;
+}
+
 static int _prepare_for_striping(struct ore_io_state *ios)
 {
 	struct ore_striping_info *si = &ios->si;
@@ -641,7 +703,6 @@ static int _prepare_for_striping(struct ore_io_state *ios)
 	unsigned devs_in_group = group_width * mirrors_p1;
 	unsigned dev = si->dev;
 	unsigned first_dev = dev - (dev % devs_in_group);
-	unsigned dev_order;
 	unsigned cur_pg = ios->pages_consumed;
 	u64 length = ios->length;
 	int ret = 0;
@@ -653,16 +714,13 @@ static int _prepare_for_striping(struct ore_io_state *ios)
 
 	BUG_ON(length > si->length);
 
-	dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev);
-	si->cur_comp = dev_order;
-	si->cur_pg = si->unit_off / PAGE_SIZE;
-
 	while (length) {
-		unsigned comp = dev - first_dev;
-		struct ore_per_dev_state *per_dev = &ios->per_dev[comp];
+		struct ore_per_dev_state *per_dev =
+						&ios->per_dev[dev - first_dev];
 		unsigned cur_len, page_off = 0;
 
-		if (!per_dev->length) {
+		if (!per_dev->length && !per_dev->offset) {
+			/* First time initialize the per_dev info. */
 			per_dev->dev = dev;
 			if (dev == si->dev) {
 				WARN_ON(dev == si->par_dev);
@@ -671,13 +729,7 @@ static int _prepare_for_striping(struct ore_io_state *ios)
 				page_off = si->unit_off & ~PAGE_MASK;
 				BUG_ON(page_off && (page_off != ios->pgbase));
 			} else {
-				if (si->cur_comp > dev_order)
-					per_dev->offset =
-						si->obj_offset - si->unit_off;
-				else /* si->cur_comp < dev_order */
-					per_dev->offset =
-						si->obj_offset + stripe_unit -
-								   si->unit_off;
+				per_dev->offset = si->obj_offset - si->unit_off;
 				cur_len = stripe_unit;
 			}
 		} else {
@@ -691,11 +743,9 @@ static int _prepare_for_striping(struct ore_io_state *ios)
 		if (unlikely(ret))
 			goto out;
 
-		dev += mirrors_p1;
-		dev = (dev % devs_in_group) + first_dev;
-
 		length -= cur_len;
 
+		dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
 		si->cur_comp = (si->cur_comp + 1) % group_width;
 		if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
 			if (!length && ios->sp2d) {
@@ -703,23 +753,16 @@ static int _prepare_for_striping(struct ore_io_state *ios)
 				 * stripe. then operate on parity dev.
 				 */
 				dev = si->par_dev;
-			}
-			if (ios->sp2d)
-				/* In writes cur_len just means if it's the
-				 * last one. See _ore_add_parity_unit.
-				 */
-				cur_len = length;
-			per_dev = &ios->per_dev[dev - first_dev];
-			if (!per_dev->length) {
-				/* Only/always the parity unit of the first
-				 * stripe will be empty. So this is a chance to
-				 * initialize the per_dev info.
-				 */
-				per_dev->dev = dev;
-				per_dev->offset = si->obj_offset - si->unit_off;
+				/* If last stripe operate on parity comp */
+				si->cur_comp = group_width - ios->layout->parity;
 			}
 
-			ret = _ore_add_parity_unit(ios, si, per_dev, cur_len);
+			/* In writes cur_len just means if it's the
+			 * last one. See _ore_add_parity_unit.
+			 */
+			ret = _add_parity_units(ios, si, dev, first_dev,
+						mirrors_p1, devs_in_group,
+						ios->sp2d ? length : cur_len);
 			if (unlikely(ret))
 					goto out;
 
@@ -730,18 +773,14 @@ static int _prepare_for_striping(struct ore_io_state *ios)
 			/* Next stripe, start fresh */
 			si->cur_comp = 0;
 			si->cur_pg = 0;
+			si->obj_offset += cur_len;
+			si->unit_off = 0;
 		}
 	}
 out:
 	ios->numdevs = devs_in_group;
 	ios->pages_consumed = cur_pg;
-	if (unlikely(ret)) {
-		if (length == ios->length)
-			return ret;
-		else
-			ios->length -= length;
-	}
-	return 0;
+	return ret;
 }
 
 int ore_create(struct ore_io_state *ios)
@@ -820,8 +859,8 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
 			struct bio *bio;
 
 			if (per_dev != master_dev) {
-				bio = bio_kmalloc(GFP_KERNEL,
-						  master_dev->bio->bi_max_vecs);
+				bio = bio_clone_kmalloc(master_dev->bio,
+							GFP_KERNEL);
 				if (unlikely(!bio)) {
 					ORE_DBGMSG(
 					      "Failed to allocate BIO size=%u\n",
@@ -830,7 +869,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
 					goto out;
 				}
 
-				__bio_clone(bio, master_dev->bio);
 				bio->bi_bdev = NULL;
 				bio->bi_next = NULL;
 				per_dev->offset = master_dev->offset;
@@ -843,11 +881,11 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
 				bio->bi_rw |= REQ_WRITE;
 			}
 
-			osd_req_write(or, _ios_obj(ios, dev), per_dev->offset,
-				      bio, per_dev->length);
+			osd_req_write(or, _ios_obj(ios, cur_comp),
+				      per_dev->offset, bio, per_dev->length);
 			ORE_DBGMSG("write(0x%llx) offset=0x%llx "
 				      "length=0x%llx dev=%d\n",
-				     _LLU(_ios_obj(ios, dev)->id),
+				     _LLU(_ios_obj(ios, cur_comp)->id),
 				     _LLU(per_dev->offset),
 				     _LLU(per_dev->length), dev);
 		} else if (ios->kern_buff) {
@@ -859,20 +897,20 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
 			       (ios->si.unit_off + ios->length >
 				ios->layout->stripe_unit));
 
-			ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev),
+			ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp),
 						 per_dev->offset,
 						 ios->kern_buff, ios->length);
 			if (unlikely(ret))
 				goto out;
 			ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
 				      "length=0x%llx dev=%d\n",
-				     _LLU(_ios_obj(ios, dev)->id),
+				     _LLU(_ios_obj(ios, cur_comp)->id),
 				     _LLU(per_dev->offset),
 				     _LLU(ios->length), per_dev->dev);
 		} else {
-			osd_req_set_attributes(or, _ios_obj(ios, dev));
+			osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
 			ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
-				     _LLU(_ios_obj(ios, dev)->id),
+				     _LLU(_ios_obj(ios, cur_comp)->id),
 				     ios->out_attr_len, dev);
 		}
 
@@ -1105,7 +1143,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
 		size_attr->attr = g_attr_logical_length;
 		size_attr->attr.val_ptr = &size_attr->newsize;
 
-		ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
+		ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
 			     _LLU(oc->comps->obj.id), _LLU(obj_size), i);
 		ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
 					&size_attr->attr);
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index d222c77cfa1..7f20f25c232 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -21,12 +21,12 @@
 #undef ORE_DBGMSG2
 #define ORE_DBGMSG2 ORE_DBGMSG
 
-struct page *_raid_page_alloc(void)
+static struct page *_raid_page_alloc(void)
 {
 	return alloc_page(GFP_KERNEL);
 }
 
-void _raid_page_free(struct page *p)
+static void _raid_page_free(struct page *p)
 {
 	__free_page(p);
 }
@@ -144,26 +144,26 @@ static void _sp2d_reset(struct __stripe_pages_2d *sp2d,
 {
 	unsigned data_devs = sp2d->data_devs;
 	unsigned group_width = data_devs + sp2d->parity;
-	unsigned p;
+	int p, c;
 
 	if (!sp2d->needed)
 		return;
 
-	for (p = 0; p < sp2d->pages_in_unit; p++) {
-		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
-
-		if (_1ps->write_count < group_width) {
-			unsigned c;
+	for (c = data_devs - 1; c >= 0; --c)
+		for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
+			struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
 
-			for (c = 0; c < data_devs; c++)
-				if (_1ps->page_is_read[c]) {
-					struct page *page = _1ps->pages[c];
+			if (_1ps->page_is_read[c]) {
+				struct page *page = _1ps->pages[c];
 
-					r4w->put_page(priv, page);
-					_1ps->page_is_read[c] = false;
-				}
+				r4w->put_page(priv, page);
+				_1ps->page_is_read[c] = false;
+			}
 		}
 
+	for (p = 0; p < sp2d->pages_in_unit; p++) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
 		memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages));
 		_1ps->write_count = 0;
 		_1ps->tx = NULL;
@@ -203,7 +203,7 @@ static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d)
 
 static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
 {
-	unsigned p;
+	int p;
 
 	for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
 		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
@@ -218,22 +218,28 @@ static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
 static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
 {
 	unsigned p;
+	unsigned tx_flags = ASYNC_TX_ACK;
+
+	if (sp2d->parity == 1)
+		tx_flags |= ASYNC_TX_XOR_ZERO_DST;
+
 	for (p = 0; p < sp2d->pages_in_unit; p++) {
 		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
 
 		if (!_1ps->write_count)
 			continue;
 
-		init_async_submit(&_1ps->submit,
-			ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK,
-			NULL,
-			NULL, NULL,
-			(addr_conv_t *)_1ps->scribble);
-
-		/* TODO: raid6 */
-		_1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages,
-				     0, sp2d->data_devs, PAGE_SIZE,
-				     &_1ps->submit);
+		init_async_submit(&_1ps->submit, tx_flags,
+			NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble);
+
+		if (sp2d->parity == 1)
+			_1ps->tx = async_xor(_1ps->pages[sp2d->data_devs],
+						_1ps->pages, 0, sp2d->data_devs,
+						PAGE_SIZE, &_1ps->submit);
+		else /* parity == 2 */
+			_1ps->tx = async_gen_syndrome(_1ps->pages, 0,
+						sp2d->data_devs + sp2d->parity,
+						PAGE_SIZE, &_1ps->submit);
 	}
 
 	for (p = 0; p < sp2d->pages_in_unit; p++) {
@@ -404,9 +410,8 @@ static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset)
 
 	ore_calc_stripe_info(ios->layout, *offset, 0, &si);
 
-	p = si.unit_off / PAGE_SIZE;
-	c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
-		       ios->layout->mirrors_p1, si.par_dev, si.dev);
+	p = si.cur_pg;
+	c = si.cur_comp;
 	page = ios->sp2d->_1p_stripes[p].pages[c];
 
 	pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE);
@@ -432,7 +437,7 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
 		if (!bio)
 			continue;
 
-		__bio_for_each_segment(bv, bio, i, 0) {
+		bio_for_each_segment_all(bv, bio, i) {
 			struct page *page = bv->bv_page;
 
 			SetPageUptodate(page);
@@ -461,16 +466,12 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
  * ios->sp2d[p][*], xor is calculated the same way. These pages are
  * allocated/freed and don't go through cache
  */
-static int _read_4_write(struct ore_io_state *ios)
+static int _read_4_write_first_stripe(struct ore_io_state *ios)
 {
-	struct ore_io_state *ios_read;
 	struct ore_striping_info read_si;
 	struct __stripe_pages_2d *sp2d = ios->sp2d;
 	u64 offset = ios->si.first_stripe_start;
-	u64 last_stripe_end;
-	unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
-	unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1;
-	int ret;
+	unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
 
 	if (offset == ios->offset) /* Go to start collect $200 */
 		goto read_last_stripe;
@@ -478,6 +479,9 @@ static int _read_4_write(struct ore_io_state *ios)
 	min_p = _sp2d_min_pg(sp2d);
 	max_p = _sp2d_max_pg(sp2d);
 
+	ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n",
+		   offset, ios->offset, min_p, max_p);
+
 	for (c = 0; ; c++) {
 		ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
 		read_si.obj_offset += min_p * PAGE_SIZE;
@@ -512,6 +516,18 @@ static int _read_4_write(struct ore_io_state *ios)
 	}
 
 read_last_stripe:
+	return 0;
+}
+
+static int _read_4_write_last_stripe(struct ore_io_state *ios)
+{
+	struct ore_striping_info read_si;
+	struct __stripe_pages_2d *sp2d = ios->sp2d;
+	u64 offset;
+	u64 last_stripe_end;
+	unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
+	unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1;
+
 	offset = ios->offset + ios->length;
 	if (offset % PAGE_SIZE)
 		_add_to_r4w_last_page(ios, &offset);
@@ -523,12 +539,8 @@ read_last_stripe:
 		goto read_it;
 
 	ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
-	p = read_si.unit_off / PAGE_SIZE;
-	c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
-		       ios->layout->mirrors_p1, read_si.par_dev, read_si.dev);
-
-	BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end);
-	/* unaligned IO must be within a single stripe */
+	p = read_si.cur_pg;
+	c = read_si.cur_comp;
 
 	if (min_p == sp2d->pages_in_unit) {
 		/* Didn't do it yet */
@@ -536,6 +548,9 @@ read_last_stripe:
 		max_p = _sp2d_max_pg(sp2d);
 	}
 
+	ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n",
+		   offset, last_stripe_end, min_p, max_p);
+
 	while (offset < last_stripe_end) {
 		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
 
@@ -568,6 +583,15 @@ read_last_stripe:
 	}
 
 read_it:
+	return 0;
+}
+
+static int _read_4_write_execute(struct ore_io_state *ios)
+{
+	struct ore_io_state *ios_read;
+	unsigned i;
+	int ret;
+
 	ios_read = ios->ios_read_4_write;
 	if (!ios_read)
 		return 0;
@@ -591,6 +615,8 @@ read_it:
 	}
 
 	_mark_read4write_pages_uptodate(ios_read, ret);
+	ore_put_io_state(ios_read);
+	ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */
 	return 0;
 }
 
@@ -598,7 +624,7 @@ read_it:
 int _ore_add_parity_unit(struct ore_io_state *ios,
 			    struct ore_striping_info *si,
 			    struct ore_per_dev_state *per_dev,
-			    unsigned cur_len)
+			    unsigned cur_len, bool do_xor)
 {
 	if (ios->reading) {
 		if (per_dev->cur_sg >= ios->sgs_per_dev) {
@@ -618,16 +644,18 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
 		si->cur_pg = _sp2d_min_pg(sp2d);
 		num_pages  = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
 
-		if (!cur_len) /* If last stripe operate on parity comp */
-			si->cur_comp = sp2d->data_devs;
-
 		if (!per_dev->length) {
 			per_dev->offset += si->cur_pg * PAGE_SIZE;
 			/* If first stripe, Read in all read4write pages
 			 * (if needed) before we calculate the first parity.
 			 */
-			_read_4_write(ios);
+			if (do_xor)
+				_read_4_write_first_stripe(ios);
 		}
+		if (!cur_len && do_xor)
+			/* If last stripe r4w pages of last stripe */
+			_read_4_write_last_stripe(ios);
+		_read_4_write_execute(ios);
 
 		for (i = 0; i < num_pages; i++) {
 			pages[i] = _raid_page_alloc();
@@ -637,7 +665,7 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
 			++(ios->cur_par_page);
 		}
 
-		BUG_ON(si->cur_comp != sp2d->data_devs);
+		BUG_ON(si->cur_comp < sp2d->data_devs);
 		BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
 
 		ret = _ore_add_stripe_unit(ios,  &array_start, 0, pages,
@@ -645,43 +673,24 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
 		if (unlikely(ret))
 			return ret;
 
-		/* TODO: raid6 if (last_parity_dev) */
-		_gen_xor_unit(sp2d);
-		_sp2d_reset(sp2d, ios->r4w, ios->private);
+		if (do_xor) {
+			_gen_xor_unit(sp2d);
+			_sp2d_reset(sp2d, ios->r4w, ios->private);
+		}
 	}
 	return 0;
 }
 
 int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
 {
-	struct ore_layout *layout = ios->layout;
-
 	if (ios->parity_pages) {
+		struct ore_layout *layout = ios->layout;
 		unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
-		unsigned stripe_size = ios->si.bytes_in_stripe;
-		u64 last_stripe, first_stripe;
 
 		if (_sp2d_alloc(pages_in_unit, layout->group_width,
 				layout->parity, &ios->sp2d)) {
 			return -ENOMEM;
 		}
-
-		/* Round io down to last full strip */
-		first_stripe = div_u64(ios->offset, stripe_size);
-		last_stripe = div_u64(ios->offset + ios->length, stripe_size);
-
-		/* If an IO spans more then a single stripe it must end at
-		 * a stripe boundary. The reminder at the end is pushed into the
-		 * next IO.
-		 */
-		if (last_stripe != first_stripe) {
-			ios->length = last_stripe * stripe_size - ios->offset;
-
-			BUG_ON(!ios->length);
-			ios->nr_pages = (ios->length + PAGE_SIZE - 1) /
-					PAGE_SIZE;
-			ios->si.length = ios->length; /*make it consistent */
-		}
 	}
 	return 0;
 }
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
index 2ffd2c3c6e4..cf6375d8212 100644
--- a/fs/exofs/ore_raid.h
+++ b/fs/exofs/ore_raid.h
@@ -31,24 +31,6 @@
 #define ORE_DBGMSG2(M...) do {} while (0)
 /* #define ORE_DBGMSG2 ORE_DBGMSG */
 
-/* Calculate the component order in a stripe. eg the logical data unit
- * address within the stripe of @dev given the @par_dev of this stripe.
- */
-static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1,
-				  unsigned par_dev, unsigned dev)
-{
-	unsigned first_dev = dev - dev % devs_in_group;
-
-	dev -= first_dev;
-	par_dev -= first_dev;
-
-	if (devs_in_group == par_dev) /* The raid 0 case */
-		return dev / mirrors_p1;
-	/* raid4/5/6 case */
-	return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) /
-	       mirrors_p1;
-}
-
 /* ios_raid.c stuff needed by ios.c */
 int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
 void _ore_free_raid_stuff(struct ore_io_state *ios);
@@ -56,7 +38,8 @@ void _ore_free_raid_stuff(struct ore_io_state *ios);
 void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
 		 bool not_last);
 int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
-		     struct ore_per_dev_state *per_dev, unsigned cur_len);
+		     struct ore_per_dev_state *per_dev, unsigned cur_len,
+		     bool do_xor);
 void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
 		       struct ore_striping_info *si, struct page *page);
 static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index d22cd168c6e..ed73ed8ebbe 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -206,6 +206,11 @@ static int init_inodecache(void)
  */
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(exofs_inode_cachep);
 }
 
@@ -384,12 +389,10 @@ static int exofs_sync_fs(struct super_block *sb, int wait)
 	if (unlikely(ret))
 		goto out;
 
-	lock_super(sb);
-
 	ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
 	memset(fscb, 0, ios->length);
 	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
-	fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
+	fscb->s_numfiles = cpu_to_le64(sbi->s_numfiles);
 	fscb->s_magic = cpu_to_le16(sb->s_magic);
 	fscb->s_newfs = 0;
 	fscb->s_version = EXOFS_FSCB_VER;
@@ -400,11 +403,7 @@ static int exofs_sync_fs(struct super_block *sb, int wait)
 	ret = ore_write(ios);
 	if (unlikely(ret))
 		EXOFS_ERR("%s: ore_write failed.\n", __func__);
-	else
-		sb->s_dirt = 0;
 
-
-	unlock_super(sb);
 out:
 	EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
 	ore_put_io_state(ios);
@@ -412,14 +411,6 @@ out:
 	return ret;
 }
 
-static void exofs_write_super(struct super_block *sb)
-{
-	if (!(sb->s_flags & MS_RDONLY))
-		exofs_sync_fs(sb, 1);
-	else
-		sb->s_dirt = 0;
-}
-
 static void _exofs_print_device(const char *msg, const char *dev_path,
 				struct osd_dev *od, u64 pid)
 {
@@ -472,6 +463,7 @@ static void exofs_put_super(struct super_block *sb)
 	_exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0),
 			    sbi->one_comp.obj.partition);
 
+	exofs_sysfs_sb_del(sbi);
 	bdi_destroy(&sbi->bdi);
 	exofs_free_sbi(sbi);
 	sb->s_fs_info = NULL;
@@ -529,7 +521,8 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
 			     struct osd_dev_info *odi)
 {
 	odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
-	memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len);
+	if (likely(odi->systemid_len))
+		memcpy(odi->systemid, dt_dev->systemid, OSD_SYSTEMID_LEN);
 
 	odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
 	odi->osdname = dt_dev->osdname;
@@ -550,7 +543,7 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
 	return !(odi->systemid_len || odi->osdname_len);
 }
 
-int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
+static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
 		      struct exofs_dev **peds)
 {
 	struct __alloc_ore_devs_and_exofs_devs {
@@ -565,7 +558,7 @@ int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
 
 	aoded = kzalloc(sizeof(*aoded), GFP_KERNEL);
 	if (unlikely(!aoded)) {
-		EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
+		EXOFS_ERR("ERROR: failed allocating Device array[%d]\n",
 			  numdevs);
 		return -ENOMEM;
 	}
@@ -631,6 +624,12 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
 	memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0],
 		(numdevs - 1) * sizeof(sbi->oc.ods[0]));
 
+	/* create sysfs subdir under which we put the device table
+	 * And cluster layout. A Superblock is identified by the string:
+	 *	"dev[0].osdname"_"pid"
+	 */
+	exofs_sysfs_sb_add(sbi, &dt->dt_dev_table[0]);
+
 	for (i = 0; i < numdevs; i++) {
 		struct exofs_fscb fscb;
 		struct osd_dev_info odi;
@@ -656,6 +655,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
 			eds[i].ored.od = fscb_od;
 			++sbi->oc.numdevs;
 			fscb_od = NULL;
+			exofs_sysfs_odev_add(&eds[i], sbi);
 			continue;
 		}
 
@@ -681,6 +681,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
 				  odi.osdname);
 			goto out;
 		}
+		exofs_sysfs_odev_add(&eds[i], sbi);
 
 		/* TODO: verify other information is correct and FS-uuid
 		 *	 matches. Benny what did you say about device table
@@ -744,7 +745,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->one_comp.obj.partition = opts->pid;
 	sbi->one_comp.obj.id = 0;
 	exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
-	sbi->oc.numdevs = 1;
 	sbi->oc.single_comp = EC_SINGLE_COMP;
 	sbi->oc.comps = &sbi->one_comp;
 
@@ -754,6 +754,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_blocksize = EXOFS_BLKSIZE;
 	sb->s_blocksize_bits = EXOFS_BLKSHIFT;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_max_links = EXOFS_LINK_MAX;
 	atomic_set(&sbi->s_curr_pending, 0);
 	sb->s_bdev = NULL;
 	sb->s_dev = 0;
@@ -802,6 +803,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 			goto free_sbi;
 
 		ore_comp_set_dev(&sbi->oc, 0, od);
+		sbi->oc.numdevs = 1;
 	}
 
 	__sbi_read_stats(sbi);
@@ -818,9 +820,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		ret = PTR_ERR(root);
 		goto free_sbi;
 	}
-	sb->s_root = d_alloc_root(root);
+	sb->s_root = d_make_root(root);
 	if (!sb->s_root) {
-		iput(root);
 		EXOFS_ERR("ERROR: get root inode failed\n");
 		ret = -ENOMEM;
 		goto free_sbi;
@@ -843,6 +844,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}
 
+	exofs_sysfs_dbg_print();
 	_exofs_print_device("Mounting", opts->dev_name,
 			    ore_comp_dev(&sbi->oc, 0),
 			    sbi->one_comp.obj.partition);
@@ -941,7 +943,6 @@ static const struct super_operations exofs_sops = {
 	.write_inode    = exofs_write_inode,
 	.evict_inode    = exofs_evict_inode,
 	.put_super      = exofs_put_super,
-	.write_super    = exofs_write_super,
 	.sync_fs	= exofs_sync_fs,
 	.statfs         = exofs_statfs,
 };
@@ -1009,6 +1010,7 @@ static struct file_system_type exofs_type = {
 	.mount          = exofs_mount,
 	.kill_sb        = generic_shutdown_super,
 };
+MODULE_ALIAS_FS("exofs");
 
 static int __init init_exofs(void)
 {
@@ -1022,6 +1024,9 @@ static int __init init_exofs(void)
 	if (err)
 		goto out_d;
 
+	/* We don't fail if sysfs creation failed */
+	exofs_sysfs_init();
+
 	return 0;
 out_d:
 	destroy_inodecache();
@@ -1031,6 +1036,7 @@ out:
 
 static void __exit exit_exofs(void)
 {
+	exofs_sysfs_uninit();
 	unregister_filesystem(&exofs_type);
 	destroy_inodecache();
 }
diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c
new file mode 100644
index 00000000000..1b4f2f95fc3
--- /dev/null
+++ b/fs/exofs/sys.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (C) 2012
+ * Sachin Bhamare <sbhamare@panasas.com>
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License 2 as published by
+ * the Free Software Foundation.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the:
+ *	Free Software Foundation <licensing@fsf.org>
+ */
+
+#include <linux/kobject.h>
+#include <linux/device.h>
+
+#include "exofs.h"
+
+struct odev_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct exofs_dev *, char *);
+	ssize_t (*store)(struct exofs_dev *, const char *, size_t);
+};
+
+static ssize_t odev_attr_show(struct kobject *kobj, struct attribute *attr,
+		char *buf)
+{
+	struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj);
+	struct odev_attr *a = container_of(attr, struct odev_attr, attr);
+
+	return a->show ? a->show(edp, buf) : 0;
+}
+
+static ssize_t odev_attr_store(struct kobject *kobj, struct attribute *attr,
+		const char *buf, size_t len)
+{
+	struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj);
+	struct odev_attr *a = container_of(attr, struct odev_attr, attr);
+
+	return a->store ? a->store(edp, buf, len) : len;
+}
+
+static const struct sysfs_ops odev_attr_ops = {
+	.show  = odev_attr_show,
+	.store = odev_attr_store,
+};
+
+
+static struct kset *exofs_kset;
+
+static ssize_t osdname_show(struct exofs_dev *edp, char *buf)
+{
+	struct osd_dev *odev = edp->ored.od;
+	const struct osd_dev_info *odi = osduld_device_info(odev);
+
+	return snprintf(buf, odi->osdname_len + 1, "%s", odi->osdname);
+}
+
+static ssize_t systemid_show(struct exofs_dev *edp, char *buf)
+{
+	struct osd_dev *odev = edp->ored.od;
+	const struct osd_dev_info *odi = osduld_device_info(odev);
+
+	memcpy(buf, odi->systemid, odi->systemid_len);
+	return odi->systemid_len;
+}
+
+static ssize_t uri_show(struct exofs_dev *edp, char *buf)
+{
+	return snprintf(buf, edp->urilen, "%s", edp->uri);
+}
+
+static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len)
+{
+	uint8_t *new_uri;
+
+	edp->urilen = strlen(buf) + 1;
+	new_uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL);
+	if (new_uri == NULL)
+		return -ENOMEM;
+	edp->uri = new_uri;
+	strncpy(edp->uri, buf, edp->urilen);
+	return edp->urilen;
+}
+
+#define OSD_ATTR(name, mode, show, store) \
+	static struct odev_attr odev_attr_##name = \
+					__ATTR(name, mode, show, store)
+
+OSD_ATTR(osdname, S_IRUGO, osdname_show, NULL);
+OSD_ATTR(systemid, S_IRUGO, systemid_show, NULL);
+OSD_ATTR(uri, S_IRWXU, uri_show, uri_store);
+
+static struct attribute *odev_attrs[] = {
+	&odev_attr_osdname.attr,
+	&odev_attr_systemid.attr,
+	&odev_attr_uri.attr,
+	NULL,
+};
+
+static struct kobj_type odev_ktype = {
+	.default_attrs	= odev_attrs,
+	.sysfs_ops	= &odev_attr_ops,
+};
+
+static struct kobj_type uuid_ktype = {
+};
+
+void exofs_sysfs_dbg_print(void)
+{
+#ifdef CONFIG_EXOFS_DEBUG
+	struct kobject *k_name, *k_tmp;
+
+	list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
+		printk(KERN_INFO "%s: name %s ref %d\n",
+			__func__, kobject_name(k_name),
+			(int)atomic_read(&k_name->kref.refcount));
+	}
+#endif
+}
+/*
+ * This function removes all kobjects under exofs_kset
+ * At the end of it, exofs_kset kobject will have a refcount
+ * of 1 which gets decremented only on exofs module unload
+ */
+void exofs_sysfs_sb_del(struct exofs_sb_info *sbi)
+{
+	struct kobject *k_name, *k_tmp;
+	struct kobject *s_kobj = &sbi->s_kobj;
+
+	list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
+		/* Remove all that are children of this SBI */
+		if (k_name->parent == s_kobj)
+			kobject_put(k_name);
+	}
+	kobject_put(s_kobj);
+}
+
+/*
+ * This function creates sysfs entries to hold the current exofs cluster
+ * instance (uniquely identified by osdname,pid tuple).
+ * This function gets called once per exofs mount instance.
+ */
+int exofs_sysfs_sb_add(struct exofs_sb_info *sbi,
+		       struct exofs_dt_device_info *dt_dev)
+{
+	struct kobject *s_kobj;
+	int retval = 0;
+	uint64_t pid = sbi->one_comp.obj.partition;
+
+	/* allocate new uuid dirent */
+	s_kobj = &sbi->s_kobj;
+	s_kobj->kset = exofs_kset;
+	retval = kobject_init_and_add(s_kobj, &uuid_ktype,
+			&exofs_kset->kobj,  "%s_%llx", dt_dev->osdname, pid);
+	if (retval) {
+		EXOFS_ERR("ERROR: Failed to create sysfs entry for "
+			  "uuid-%s_%llx => %d\n", dt_dev->osdname, pid, retval);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+int exofs_sysfs_odev_add(struct exofs_dev *edev, struct exofs_sb_info *sbi)
+{
+	struct kobject *d_kobj;
+	int retval = 0;
+
+	/* create osd device group which contains following attributes
+	 * osdname, systemid & uri
+	 */
+	d_kobj = &edev->ed_kobj;
+	d_kobj->kset = exofs_kset;
+	retval = kobject_init_and_add(d_kobj, &odev_ktype,
+			&sbi->s_kobj, "dev%u", edev->did);
+	if (retval) {
+		EXOFS_ERR("ERROR: Failed to create sysfs entry for "
+				"device dev%u\n", edev->did);
+		return retval;
+	}
+	return 0;
+}
+
+int exofs_sysfs_init(void)
+{
+	exofs_kset = kset_create_and_add("exofs", NULL, fs_kobj);
+	if (!exofs_kset) {
+		EXOFS_ERR("ERROR: kset_create_and_add exofs failed\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void exofs_sysfs_uninit(void)
+{
+	kset_unregister(exofs_kset);
+}