diff options
Diffstat (limited to 'fs/exofs')
| -rw-r--r-- | fs/exofs/Kbuild | 2 | ||||
| -rw-r--r-- | fs/exofs/Kconfig.ore | 2 | ||||
| -rw-r--r-- | fs/exofs/dir.c | 42 | ||||
| -rw-r--r-- | fs/exofs/exofs.h | 14 | ||||
| -rw-r--r-- | fs/exofs/file.c | 10 | ||||
| -rw-r--r-- | fs/exofs/inode.c | 92 | ||||
| -rw-r--r-- | fs/exofs/namei.c | 17 | ||||
| -rw-r--r-- | fs/exofs/ore.c | 174 | ||||
| -rw-r--r-- | fs/exofs/ore_raid.c | 151 | ||||
| -rw-r--r-- | fs/exofs/ore_raid.h | 21 | ||||
| -rw-r--r-- | fs/exofs/super.c | 50 | ||||
| -rw-r--r-- | fs/exofs/sys.c | 205 |
12 files changed, 520 insertions, 260 deletions
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index 352ba149d23..389ba8312d5 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -16,5 +16,5 @@ libore-y := ore.o ore_raid.o obj-$(CONFIG_ORE) += libore.o -exofs-y := inode.o file.o symlink.o namei.o dir.o super.o +exofs-y := inode.o file.o symlink.o namei.o dir.o super.o sys.o obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore index 1ca7fb7b6ba..2daf2329c28 100644 --- a/fs/exofs/Kconfig.ore +++ b/fs/exofs/Kconfig.ore @@ -9,4 +9,6 @@ config ORE tristate depends on EXOFS_FS || PNFS_OBJLAYOUT select ASYNC_XOR + select RAID6_PQ + select ASYNC_PQ default SCSI_OSD_ULD diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 80405836ba6..49f51ab4caa 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -239,22 +239,19 @@ void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode) } static int -exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) +exofs_readdir(struct file *file, struct dir_context *ctx) { - loff_t pos = filp->f_pos; - struct inode *inode = filp->f_path.dentry->d_inode; + loff_t pos = ctx->pos; + struct inode *inode = file_inode(file); unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(exofs_chunk_size(inode)-1); - unsigned char *types = NULL; - int need_revalidate = (filp->f_version != inode->i_version); + int need_revalidate = (file->f_version != inode->i_version); if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1)) return 0; - types = exofs_filetype_table; - for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; struct exofs_dir_entry *de; @@ -263,7 +260,7 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (IS_ERR(page)) { EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n", inode->i_ino); - filp->f_pos += PAGE_CACHE_SIZE - offset; + ctx->pos += PAGE_CACHE_SIZE - offset; return PTR_ERR(page); } kaddr = page_address(page); @@ -271,9 +268,9 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (offset) { offset = exofs_validate_entry(kaddr, offset, chunk_mask); - filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; + ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset; } - filp->f_version = inode->i_version; + file->f_version = inode->i_version; need_revalidate = 0; } de = (struct exofs_dir_entry *)(kaddr + offset); @@ -288,27 +285,24 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) return -EIO; } if (de->inode_no) { - int over; - unsigned char d_type = DT_UNKNOWN; + unsigned char t; - if (types && de->file_type < EXOFS_FT_MAX) - d_type = types[de->file_type]; + if (de->file_type < EXOFS_FT_MAX) + t = exofs_filetype_table[de->file_type]; + else + t = DT_UNKNOWN; - offset = (char *)de - kaddr; - over = filldir(dirent, de->name, de->name_len, - (n<<PAGE_CACHE_SHIFT) | offset, + if (!dir_emit(ctx, de->name, de->name_len, le64_to_cpu(de->inode_no), - d_type); - if (over) { + t)) { exofs_put_page(page); return 0; } } - filp->f_pos += le16_to_cpu(de->rec_len); + ctx->pos += le16_to_cpu(de->rec_len); } exofs_put_page(page); } - return 0; } @@ -597,7 +591,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent) goto fail; } - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); de = (struct exofs_dir_entry *)kaddr; de->name_len = 1; de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1)); @@ -611,7 +605,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent) de->inode_no = cpu_to_le64(parent->i_ino); memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); exofs_set_de_type(de, inode); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); err = exofs_commit_chunk(page, 0, chunk_size); fail: page_cache_release(page); @@ -669,5 +663,5 @@ not_empty: const struct file_operations exofs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = exofs_readdir, + .iterate = exofs_readdir, }; diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index ca9d49665ef..fffe86fd7a4 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -56,6 +56,9 @@ struct exofs_dev { struct ore_dev ored; unsigned did; + unsigned urilen; + uint8_t *uri; + struct kobject ed_kobj; }; /* * our extension to the in-memory superblock @@ -73,6 +76,7 @@ struct exofs_sb_info { struct ore_layout layout; /* Default files layout */ struct ore_comp one_comp; /* id & cred of partition id=0*/ struct ore_components oc; /* comps for the partition */ + struct kobject s_kobj; /* holds per-sbi kobject */ }; /* @@ -176,6 +180,16 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj); int exofs_sbi_write_stats(struct exofs_sb_info *sbi); +/* sys.c */ +int exofs_sysfs_init(void); +void exofs_sysfs_uninit(void); +int exofs_sysfs_sb_add(struct exofs_sb_info *sbi, + struct exofs_dt_device_info *dt_dev); +void exofs_sysfs_sb_del(struct exofs_sb_info *sbi); +int exofs_sysfs_odev_add(struct exofs_dev *edev, + struct exofs_sb_info *sbi); +void exofs_sysfs_dbg_print(void); + /********************* * operation vectors * *********************/ diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 491c6c078e7..71bf8e4fb5d 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -67,17 +67,17 @@ static int exofs_flush(struct file *file, fl_owner_t id) const struct file_operations exofs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .open = generic_file_open, .release = exofs_release_file, .fsync = exofs_file_fsync, .flush = exofs_flush, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; const struct inode_operations exofs_file_inode_operations = { diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index ea5e1f97806..3f9cafd7393 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -37,15 +37,12 @@ #define EXOFS_DBGMSG2(M...) do {} while (0) -enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), }; - unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned expected_pages) { - unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); + unsigned pages = min_t(unsigned, expected_pages, + layout->max_io_length / PAGE_SIZE); - /* TODO: easily support bio chaining */ - pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE); return pages; } @@ -101,7 +98,8 @@ static void _pcol_reset(struct page_collect *pcol) * it might not end here. don't be left with nothing */ if (!pcol->expected_pages) - pcol->expected_pages = MAX_PAGES_KMALLOC; + pcol->expected_pages = + exofs_max_io_pages(&pcol->sbi->layout, ~0); } static int pcol_try_alloc(struct page_collect *pcol) @@ -363,12 +361,12 @@ static int read_exec(struct page_collect *pcol) return 0; err: - if (!pcol->read_4_write) - _unlock_pcol_pages(pcol, ret, READ); - - pcol_free(pcol); - + if (!pcol_copy) /* Failed before ownership transfer */ + pcol_copy = pcol; + _unlock_pcol_pages(pcol_copy, ret, READ); + pcol_free(pcol_copy); kfree(pcol_copy); + return ret; } @@ -389,6 +387,8 @@ static int readpage_strip(void *data, struct page *page) size_t len; int ret; + BUG_ON(!PageLocked(page)); + /* FIXME: Just for debugging, will be removed */ if (PageUptodate(page)) EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, @@ -572,8 +572,16 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) if (!pcol->that_locked_page || (pcol->that_locked_page->index != index)) { - struct page *page = find_get_page(pcol->inode->i_mapping, index); + struct page *page; + loff_t i_size = i_size_read(pcol->inode); + if (offset >= i_size) { + *uptodate = true; + EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index); + return ZERO_PAGE(0); + } + + page = find_get_page(pcol->inode->i_mapping, index); if (!page) { page = find_or_create_page(pcol->inode->i_mapping, index, GFP_NOFS); @@ -588,10 +596,10 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) *uptodate = true; else *uptodate = PageUptodate(page); - EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate); + EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate); return page; } else { - EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n", + EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n", pcol->that_locked_page->index); *uptodate = true; return pcol->that_locked_page; @@ -602,12 +610,13 @@ static void __r4w_put_page(void *priv, struct page *page) { struct page_collect *pcol = priv; - if (pcol->that_locked_page != page) { - EXOFS_DBGMSG("index=0x%lx\n", page->index); + if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) { + EXOFS_DBGMSG2("index=0x%lx\n", page->index); page_cache_release(page); return; } - EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index); + EXOFS_DBGMSG2("that_locked_page index=0x%lx\n", + ZERO_PAGE(0) == page ? -1 : page->index); } static const struct _ore_r4w_op _r4w_op = { @@ -667,8 +676,10 @@ static int write_exec(struct page_collect *pcol) return 0; err: - _unlock_pcol_pages(pcol, ret, WRITE); - pcol_free(pcol); + if (!pcol_copy) /* Failed before ownership transfer */ + pcol_copy = pcol; + _unlock_pcol_pages(pcol_copy, ret, WRITE); + pcol_free(pcol_copy); kfree(pcol_copy); return ret; @@ -850,7 +861,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc) static void _write_failed(struct inode *inode, loff_t to) { if (to > inode->i_size) - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); } int exofs_write_begin(struct file *file, struct address_space *mapping, @@ -942,12 +953,22 @@ static int exofs_releasepage(struct page *page, gfp_t gfp) return 0; } -static void exofs_invalidatepage(struct page *page, unsigned long offset) +static void exofs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { - EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset); + EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n", + page->index, offset, length); WARN_ON(1); } + + /* TODO: Should be easy enough to do proprly */ +static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb, + struct iov_iter *iter, loff_t offset) +{ + return 0; +} + const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, @@ -961,7 +982,7 @@ const struct address_space_operations exofs_aops = { /* Not implemented Yet */ .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ - .direct_IO = NULL, /* TODO: Should be trivial to do */ + .direct_IO = exofs_direct_IO, /* With these NULL has special meaning or default is not exported */ .get_xip_mem = NULL, @@ -997,7 +1018,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) if (likely(!ret)) truncate_setsize(inode, newsize); - EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n", + EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n", inode->i_ino, newsize, ret); return ret; } @@ -1081,14 +1102,13 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, /* If object is lost on target we might as well enable it's * delete. */ - if ((ret == -ENOENT) || (ret == -EINVAL)) - ret = 0; + ret = 0; goto out; } ret = extract_attr_from_ios(ios, &attrs[0]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__); goto out; } WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); @@ -1096,7 +1116,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, ret = extract_attr_from_ios(ios, &attrs[1]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__); goto out; } if (attrs[1].len) { @@ -1111,7 +1131,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, ret = extract_attr_from_ios(ios, &attrs[2]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__); goto out; } if (attrs[2].len) { @@ -1163,8 +1183,8 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) /* copy stuff from on-disk struct to in-memory struct */ inode->i_mode = le16_to_cpu(fcb.i_mode); - inode->i_uid = le32_to_cpu(fcb.i_uid); - inode->i_gid = le32_to_cpu(fcb.i_gid); + i_uid_write(inode, le32_to_cpu(fcb.i_uid)); + i_gid_write(inode, le32_to_cpu(fcb.i_gid)); set_nlink(inode, le16_to_cpu(fcb.i_links_count)); inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); @@ -1376,8 +1396,8 @@ static int exofs_update_inode(struct inode *inode, int do_sync) fcb = &args->fcb; fcb->i_mode = cpu_to_le16(inode->i_mode); - fcb->i_uid = cpu_to_le32(inode->i_uid); - fcb->i_gid = cpu_to_le32(inode->i_gid); + fcb->i_uid = cpu_to_le32(i_uid_read(inode)); + fcb->i_gid = cpu_to_le32(i_gid_read(inode)); fcb->i_links_count = cpu_to_le16(inode->i_nlink); fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec); @@ -1466,14 +1486,14 @@ void exofs_evict_inode(struct inode *inode) struct ore_io_state *ios; int ret; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); /* TODO: should do better here */ if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; inode->i_size = 0; - end_writeback(inode); + clear_inode(inode); /* if we are deleting an obj that hasn't been created yet, wait. * This also makes sure that create_done cannot be called with an @@ -1503,5 +1523,5 @@ void exofs_evict_inode(struct inode *inode) return; no_delete: - end_writeback(inode); + clear_inode(inode); } diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 9dbf0c30103..4731fd991ef 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -46,7 +46,7 @@ static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode) } static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode; ino_t ino; @@ -60,7 +60,7 @@ static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry, } static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { struct inode *inode = exofs_new_inode(dir, mode); int err = PTR_ERR(inode); @@ -143,9 +143,6 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir, { struct inode *inode = old_dentry->d_inode; - if (inode->i_nlink >= EXOFS_LINK_MAX) - return -EMLINK; - inode->i_ctime = CURRENT_TIME; inode_inc_link_count(inode); ihold(inode); @@ -156,10 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir, static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; - int err = -EMLINK; - - if (dir->i_nlink >= EXOFS_LINK_MAX) - goto out; + int err; inode_inc_link_count(dir); @@ -275,11 +269,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto out_dir; } else { - if (dir_de) { - err = -EMLINK; - if (new_dir->i_nlink >= EXOFS_LINK_MAX) - goto out_dir; - } err = exofs_add_link(new_dentry, old_inode); if (err) goto out_dir; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 49cf230554a..cfc0205d62c 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -58,9 +58,12 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->parity = 1; break; case PNFS_OSD_RAID_PQ: + layout->parity = 2; + break; case PNFS_OSD_RAID_4: default: - ORE_ERR("Only RAID_0/5 for now\n"); + ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n", + layout->raid_algorithm); return -EINVAL; } if (0 != (layout->stripe_unit & ~PAGE_MASK)) { @@ -103,7 +106,7 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->max_io_length = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * - layout->group_width; + (layout->group_width - layout->parity); if (layout->parity) { unsigned stripe_length = (layout->group_width - layout->parity) * @@ -112,6 +115,8 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->max_io_length /= stripe_length; layout->max_io_length *= stripe_length; } + ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length); + return 0; } EXPORT_SYMBOL(ore_verify_layout); @@ -286,7 +291,8 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, if (length) { ore_calc_stripe_info(layout, offset, length, &ios->si); ios->length = ios->si.length; - ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) + + ios->length + PAGE_SIZE - 1) / PAGE_SIZE; if (layout->parity) _ore_post_alloc_raid_stuff(ios); } @@ -401,7 +407,7 @@ static void _clear_bio(struct bio *bio) struct bio_vec *bv; unsigned i; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { unsigned this_count = bv->bv_len; if (likely(PAGE_SIZE == this_count)) @@ -430,8 +436,12 @@ int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) if (likely(!ret)) continue; - if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { - /* start read offset passed endof file */ + if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) && + per_dev->bio) { + /* start read offset passed endof file. + * Note: if we do not have bio it means read-attributes + * In this case we should return error to caller. + */ _clear_bio(per_dev->bio); ORE_DBGMSG("start read offset passed end of file " "offset=0x%llx, length=0x%llx\n", @@ -536,24 +546,28 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, u64 H = LmodS - G * T; u32 N = div_u64(H, U); + u32 Nlast; /* "H - (N * U)" is just "H % U" so it's bound to u32 */ u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; + u32 first_dev = C - C % group_width; div_u64_rem(file_offset, stripe_unit, &si->unit_off); si->obj_offset = si->unit_off + (N * stripe_unit) + (M * group_depth * stripe_unit); + si->cur_comp = C - first_dev; + si->cur_pg = si->unit_off / PAGE_SIZE; if (parity) { u32 LCMdP = lcm(group_width, parity) / parity; /* R = N % LCMdP; */ u32 RxP = (N % LCMdP) * parity; - u32 first_dev = C - C % group_width; si->par_dev = (group_width + group_width - parity - RxP) % group_width + first_dev; - si->dev = (group_width + C - RxP) % group_width + first_dev; + si->dev = (group_width + group_width + C - RxP) % + group_width + first_dev; si->bytes_in_stripe = U; si->first_stripe_start = M * S + G * T + N * U; } else { @@ -568,6 +582,10 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, si->length = T - H; if (si->length > length) si->length = length; + + Nlast = div_u64(H + si->length + U - 1, U); + si->maxdevUnits = Nlast - N; + si->M = M; } EXPORT_SYMBOL(ore_calc_stripe_info); @@ -583,13 +601,16 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, int ret; if (per_dev->bio == NULL) { - unsigned pages_in_stripe = ios->layout->group_width * - (ios->layout->stripe_unit / PAGE_SIZE); - unsigned nr_pages = ios->nr_pages * ios->layout->group_width / - (ios->layout->group_width - - ios->layout->parity); - unsigned bio_size = (nr_pages + pages_in_stripe) / - ios->layout->group_width; + unsigned bio_size; + + if (!ios->reading) { + bio_size = ios->si.maxdevUnits; + } else { + bio_size = (ios->si.maxdevUnits + 1) * + (ios->layout->group_width - ios->layout->parity) / + ios->layout->group_width; + } + bio_size *= (ios->layout->stripe_unit / PAGE_SIZE); per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); if (unlikely(!per_dev->bio)) { @@ -609,8 +630,12 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], pglen, pgbase); if (unlikely(pglen != added_len)) { - ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", - per_dev->bio->bi_vcnt); + /* If bi_vcnt == bi_max then this is a SW BUG */ + ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x " + "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n", + per_dev->bio->bi_vcnt, + per_dev->bio->bi_max_vecs, + BIO_MAX_PAGES_KMALLOC, cur_len); ret = -ENOMEM; goto out; } @@ -632,6 +657,43 @@ out: /* we fail the complete unit on an error eg don't advance return ret; } +static int _add_parity_units(struct ore_io_state *ios, + struct ore_striping_info *si, + unsigned dev, unsigned first_dev, + unsigned mirrors_p1, unsigned devs_in_group, + unsigned cur_len) +{ + unsigned do_parity; + int ret = 0; + + for (do_parity = ios->layout->parity; do_parity; --do_parity) { + struct ore_per_dev_state *per_dev; + + per_dev = &ios->per_dev[dev - first_dev]; + if (!per_dev->length && !per_dev->offset) { + /* Only/always the parity unit of the first + * stripe will be empty. So this is a chance to + * initialize the per_dev info. + */ + per_dev->dev = dev; + per_dev->offset = si->obj_offset - si->unit_off; + } + + ret = _ore_add_parity_unit(ios, si, per_dev, cur_len, + do_parity == 1); + if (unlikely(ret)) + break; + + if (do_parity != 1) { + dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; + si->cur_comp = (si->cur_comp + 1) % + ios->layout->group_width; + } + } + + return ret; +} + static int _prepare_for_striping(struct ore_io_state *ios) { struct ore_striping_info *si = &ios->si; @@ -641,7 +703,6 @@ static int _prepare_for_striping(struct ore_io_state *ios) unsigned devs_in_group = group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); - unsigned dev_order; unsigned cur_pg = ios->pages_consumed; u64 length = ios->length; int ret = 0; @@ -653,16 +714,13 @@ static int _prepare_for_striping(struct ore_io_state *ios) BUG_ON(length > si->length); - dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); - si->cur_comp = dev_order; - si->cur_pg = si->unit_off / PAGE_SIZE; - while (length) { - unsigned comp = dev - first_dev; - struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; + struct ore_per_dev_state *per_dev = + &ios->per_dev[dev - first_dev]; unsigned cur_len, page_off = 0; - if (!per_dev->length) { + if (!per_dev->length && !per_dev->offset) { + /* First time initialize the per_dev info. */ per_dev->dev = dev; if (dev == si->dev) { WARN_ON(dev == si->par_dev); @@ -671,13 +729,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) page_off = si->unit_off & ~PAGE_MASK; BUG_ON(page_off && (page_off != ios->pgbase)); } else { - if (si->cur_comp > dev_order) - per_dev->offset = - si->obj_offset - si->unit_off; - else /* si->cur_comp < dev_order */ - per_dev->offset = - si->obj_offset + stripe_unit - - si->unit_off; + per_dev->offset = si->obj_offset - si->unit_off; cur_len = stripe_unit; } } else { @@ -691,11 +743,9 @@ static int _prepare_for_striping(struct ore_io_state *ios) if (unlikely(ret)) goto out; - dev += mirrors_p1; - dev = (dev % devs_in_group) + first_dev; - length -= cur_len; + dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; si->cur_comp = (si->cur_comp + 1) % group_width; if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { if (!length && ios->sp2d) { @@ -703,23 +753,16 @@ static int _prepare_for_striping(struct ore_io_state *ios) * stripe. then operate on parity dev. */ dev = si->par_dev; - } - if (ios->sp2d) - /* In writes cur_len just means if it's the - * last one. See _ore_add_parity_unit. - */ - cur_len = length; - per_dev = &ios->per_dev[dev - first_dev]; - if (!per_dev->length) { - /* Only/always the parity unit of the first - * stripe will be empty. So this is a chance to - * initialize the per_dev info. - */ - per_dev->dev = dev; - per_dev->offset = si->obj_offset - si->unit_off; + /* If last stripe operate on parity comp */ + si->cur_comp = group_width - ios->layout->parity; } - ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); + /* In writes cur_len just means if it's the + * last one. See _ore_add_parity_unit. + */ + ret = _add_parity_units(ios, si, dev, first_dev, + mirrors_p1, devs_in_group, + ios->sp2d ? length : cur_len); if (unlikely(ret)) goto out; @@ -730,18 +773,14 @@ static int _prepare_for_striping(struct ore_io_state *ios) /* Next stripe, start fresh */ si->cur_comp = 0; si->cur_pg = 0; + si->obj_offset += cur_len; + si->unit_off = 0; } } out: ios->numdevs = devs_in_group; ios->pages_consumed = cur_pg; - if (unlikely(ret)) { - if (length == ios->length) - return ret; - else - ios->length -= length; - } - return 0; + return ret; } int ore_create(struct ore_io_state *ios) @@ -820,8 +859,8 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) struct bio *bio; if (per_dev != master_dev) { - bio = bio_kmalloc(GFP_KERNEL, - master_dev->bio->bi_max_vecs); + bio = bio_clone_kmalloc(master_dev->bio, + GFP_KERNEL); if (unlikely(!bio)) { ORE_DBGMSG( "Failed to allocate BIO size=%u\n", @@ -830,7 +869,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) goto out; } - __bio_clone(bio, master_dev->bio); bio->bi_bdev = NULL; bio->bi_next = NULL; per_dev->offset = master_dev->offset; @@ -843,11 +881,11 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) bio->bi_rw |= REQ_WRITE; } - osd_req_write(or, _ios_obj(ios, dev), per_dev->offset, - bio, per_dev->length); + osd_req_write(or, _ios_obj(ios, cur_comp), + per_dev->offset, bio, per_dev->length); ORE_DBGMSG("write(0x%llx) offset=0x%llx " "length=0x%llx dev=%d\n", - _LLU(_ios_obj(ios, dev)->id), + _LLU(_ios_obj(ios, cur_comp)->id), _LLU(per_dev->offset), _LLU(per_dev->length), dev); } else if (ios->kern_buff) { @@ -859,20 +897,20 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) (ios->si.unit_off + ios->length > ios->layout->stripe_unit)); - ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev), + ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp), per_dev->offset, ios->kern_buff, ios->length); if (unlikely(ret)) goto out; ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx " "length=0x%llx dev=%d\n", - _LLU(_ios_obj(ios, dev)->id), + _LLU(_ios_obj(ios, cur_comp)->id), _LLU(per_dev->offset), _LLU(ios->length), per_dev->dev); } else { - osd_req_set_attributes(or, _ios_obj(ios, dev)); + osd_req_set_attributes(or, _ios_obj(ios, cur_comp)); ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", - _LLU(_ios_obj(ios, dev)->id), + _LLU(_ios_obj(ios, cur_comp)->id), ios->out_attr_len, dev); } @@ -1105,7 +1143,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc, size_attr->attr = g_attr_logical_length; size_attr->attr.val_ptr = &size_attr->newsize; - ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", + ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", _LLU(oc->comps->obj.id), _LLU(obj_size), i); ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, &size_attr->attr); diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index d222c77cfa1..7f20f25c232 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -21,12 +21,12 @@ #undef ORE_DBGMSG2 #define ORE_DBGMSG2 ORE_DBGMSG -struct page *_raid_page_alloc(void) +static struct page *_raid_page_alloc(void) { return alloc_page(GFP_KERNEL); } -void _raid_page_free(struct page *p) +static void _raid_page_free(struct page *p) { __free_page(p); } @@ -144,26 +144,26 @@ static void _sp2d_reset(struct __stripe_pages_2d *sp2d, { unsigned data_devs = sp2d->data_devs; unsigned group_width = data_devs + sp2d->parity; - unsigned p; + int p, c; if (!sp2d->needed) return; - for (p = 0; p < sp2d->pages_in_unit; p++) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - - if (_1ps->write_count < group_width) { - unsigned c; + for (c = data_devs - 1; c >= 0; --c) + for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - for (c = 0; c < data_devs; c++) - if (_1ps->page_is_read[c]) { - struct page *page = _1ps->pages[c]; + if (_1ps->page_is_read[c]) { + struct page *page = _1ps->pages[c]; - r4w->put_page(priv, page); - _1ps->page_is_read[c] = false; - } + r4w->put_page(priv, page); + _1ps->page_is_read[c] = false; + } } + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); _1ps->write_count = 0; _1ps->tx = NULL; @@ -203,7 +203,7 @@ static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) { - unsigned p; + int p; for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; @@ -218,22 +218,28 @@ static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) { unsigned p; + unsigned tx_flags = ASYNC_TX_ACK; + + if (sp2d->parity == 1) + tx_flags |= ASYNC_TX_XOR_ZERO_DST; + for (p = 0; p < sp2d->pages_in_unit; p++) { struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; if (!_1ps->write_count) continue; - init_async_submit(&_1ps->submit, - ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, - NULL, - NULL, NULL, - (addr_conv_t *)_1ps->scribble); - - /* TODO: raid6 */ - _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, - 0, sp2d->data_devs, PAGE_SIZE, - &_1ps->submit); + init_async_submit(&_1ps->submit, tx_flags, + NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble); + + if (sp2d->parity == 1) + _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], + _1ps->pages, 0, sp2d->data_devs, + PAGE_SIZE, &_1ps->submit); + else /* parity == 2 */ + _1ps->tx = async_gen_syndrome(_1ps->pages, 0, + sp2d->data_devs + sp2d->parity, + PAGE_SIZE, &_1ps->submit); } for (p = 0; p < sp2d->pages_in_unit; p++) { @@ -404,9 +410,8 @@ static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) ore_calc_stripe_info(ios->layout, *offset, 0, &si); - p = si.unit_off / PAGE_SIZE; - c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, - ios->layout->mirrors_p1, si.par_dev, si.dev); + p = si.cur_pg; + c = si.cur_comp; page = ios->sp2d->_1p_stripes[p].pages[c]; pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); @@ -432,7 +437,7 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) if (!bio) continue; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; SetPageUptodate(page); @@ -461,16 +466,12 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) * ios->sp2d[p][*], xor is calculated the same way. These pages are * allocated/freed and don't go through cache */ -static int _read_4_write(struct ore_io_state *ios) +static int _read_4_write_first_stripe(struct ore_io_state *ios) { - struct ore_io_state *ios_read; struct ore_striping_info read_si; struct __stripe_pages_2d *sp2d = ios->sp2d; u64 offset = ios->si.first_stripe_start; - u64 last_stripe_end; - unsigned bytes_in_stripe = ios->si.bytes_in_stripe; - unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1; - int ret; + unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; if (offset == ios->offset) /* Go to start collect $200 */ goto read_last_stripe; @@ -478,6 +479,9 @@ static int _read_4_write(struct ore_io_state *ios) min_p = _sp2d_min_pg(sp2d); max_p = _sp2d_max_pg(sp2d); + ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n", + offset, ios->offset, min_p, max_p); + for (c = 0; ; c++) { ore_calc_stripe_info(ios->layout, offset, 0, &read_si); read_si.obj_offset += min_p * PAGE_SIZE; @@ -512,6 +516,18 @@ static int _read_4_write(struct ore_io_state *ios) } read_last_stripe: + return 0; +} + +static int _read_4_write_last_stripe(struct ore_io_state *ios) +{ + struct ore_striping_info read_si; + struct __stripe_pages_2d *sp2d = ios->sp2d; + u64 offset; + u64 last_stripe_end; + unsigned bytes_in_stripe = ios->si.bytes_in_stripe; + unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; + offset = ios->offset + ios->length; if (offset % PAGE_SIZE) _add_to_r4w_last_page(ios, &offset); @@ -523,12 +539,8 @@ read_last_stripe: goto read_it; ore_calc_stripe_info(ios->layout, offset, 0, &read_si); - p = read_si.unit_off / PAGE_SIZE; - c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, - ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); - - BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end); - /* unaligned IO must be within a single stripe */ + p = read_si.cur_pg; + c = read_si.cur_comp; if (min_p == sp2d->pages_in_unit) { /* Didn't do it yet */ @@ -536,6 +548,9 @@ read_last_stripe: max_p = _sp2d_max_pg(sp2d); } + ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n", + offset, last_stripe_end, min_p, max_p); + while (offset < last_stripe_end) { struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; @@ -568,6 +583,15 @@ read_last_stripe: } read_it: + return 0; +} + +static int _read_4_write_execute(struct ore_io_state *ios) +{ + struct ore_io_state *ios_read; + unsigned i; + int ret; + ios_read = ios->ios_read_4_write; if (!ios_read) return 0; @@ -591,6 +615,8 @@ read_it: } _mark_read4write_pages_uptodate(ios_read, ret); + ore_put_io_state(ios_read); + ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */ return 0; } @@ -598,7 +624,7 @@ read_it: int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, struct ore_per_dev_state *per_dev, - unsigned cur_len) + unsigned cur_len, bool do_xor) { if (ios->reading) { if (per_dev->cur_sg >= ios->sgs_per_dev) { @@ -618,16 +644,18 @@ int _ore_add_parity_unit(struct ore_io_state *ios, si->cur_pg = _sp2d_min_pg(sp2d); num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; - if (!cur_len) /* If last stripe operate on parity comp */ - si->cur_comp = sp2d->data_devs; - if (!per_dev->length) { per_dev->offset += si->cur_pg * PAGE_SIZE; /* If first stripe, Read in all read4write pages * (if needed) before we calculate the first parity. */ - _read_4_write(ios); + if (do_xor) + _read_4_write_first_stripe(ios); } + if (!cur_len && do_xor) + /* If last stripe r4w pages of last stripe */ + _read_4_write_last_stripe(ios); + _read_4_write_execute(ios); for (i = 0; i < num_pages; i++) { pages[i] = _raid_page_alloc(); @@ -637,7 +665,7 @@ int _ore_add_parity_unit(struct ore_io_state *ios, ++(ios->cur_par_page); } - BUG_ON(si->cur_comp != sp2d->data_devs); + BUG_ON(si->cur_comp < sp2d->data_devs); BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, @@ -645,43 +673,24 @@ int _ore_add_parity_unit(struct ore_io_state *ios, if (unlikely(ret)) return ret; - /* TODO: raid6 if (last_parity_dev) */ - _gen_xor_unit(sp2d); - _sp2d_reset(sp2d, ios->r4w, ios->private); + if (do_xor) { + _gen_xor_unit(sp2d); + _sp2d_reset(sp2d, ios->r4w, ios->private); + } } return 0; } int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) { - struct ore_layout *layout = ios->layout; - if (ios->parity_pages) { + struct ore_layout *layout = ios->layout; unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; - unsigned stripe_size = ios->si.bytes_in_stripe; - u64 last_stripe, first_stripe; if (_sp2d_alloc(pages_in_unit, layout->group_width, layout->parity, &ios->sp2d)) { return -ENOMEM; } - - /* Round io down to last full strip */ - first_stripe = div_u64(ios->offset, stripe_size); - last_stripe = div_u64(ios->offset + ios->length, stripe_size); - - /* If an IO spans more then a single stripe it must end at - * a stripe boundary. The reminder at the end is pushed into the - * next IO. - */ - if (last_stripe != first_stripe) { - ios->length = last_stripe * stripe_size - ios->offset; - - BUG_ON(!ios->length); - ios->nr_pages = (ios->length + PAGE_SIZE - 1) / - PAGE_SIZE; - ios->si.length = ios->length; /*make it consistent */ - } } return 0; } diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h index 2ffd2c3c6e4..cf6375d8212 100644 --- a/fs/exofs/ore_raid.h +++ b/fs/exofs/ore_raid.h @@ -31,24 +31,6 @@ #define ORE_DBGMSG2(M...) do {} while (0) /* #define ORE_DBGMSG2 ORE_DBGMSG */ -/* Calculate the component order in a stripe. eg the logical data unit - * address within the stripe of @dev given the @par_dev of this stripe. - */ -static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1, - unsigned par_dev, unsigned dev) -{ - unsigned first_dev = dev - dev % devs_in_group; - - dev -= first_dev; - par_dev -= first_dev; - - if (devs_in_group == par_dev) /* The raid 0 case */ - return dev / mirrors_p1; - /* raid4/5/6 case */ - return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) / - mirrors_p1; -} - /* ios_raid.c stuff needed by ios.c */ int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); void _ore_free_raid_stuff(struct ore_io_state *ios); @@ -56,7 +38,8 @@ void _ore_free_raid_stuff(struct ore_io_state *ios); void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, bool not_last); int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, - struct ore_per_dev_state *per_dev, unsigned cur_len); + struct ore_per_dev_state *per_dev, unsigned cur_len, + bool do_xor); void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, struct ore_striping_info *si, struct page *page); static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, diff --git a/fs/exofs/super.c b/fs/exofs/super.c index d22cd168c6e..ed73ed8ebbe 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -206,6 +206,11 @@ static int init_inodecache(void) */ static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(exofs_inode_cachep); } @@ -384,12 +389,10 @@ static int exofs_sync_fs(struct super_block *sb, int wait) if (unlikely(ret)) goto out; - lock_super(sb); - ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); memset(fscb, 0, ios->length); fscb->s_nextid = cpu_to_le64(sbi->s_nextid); - fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); + fscb->s_numfiles = cpu_to_le64(sbi->s_numfiles); fscb->s_magic = cpu_to_le16(sb->s_magic); fscb->s_newfs = 0; fscb->s_version = EXOFS_FSCB_VER; @@ -400,11 +403,7 @@ static int exofs_sync_fs(struct super_block *sb, int wait) ret = ore_write(ios); if (unlikely(ret)) EXOFS_ERR("%s: ore_write failed.\n", __func__); - else - sb->s_dirt = 0; - - unlock_super(sb); out: EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); ore_put_io_state(ios); @@ -412,14 +411,6 @@ out: return ret; } -static void exofs_write_super(struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) - exofs_sync_fs(sb, 1); - else - sb->s_dirt = 0; -} - static void _exofs_print_device(const char *msg, const char *dev_path, struct osd_dev *od, u64 pid) { @@ -472,6 +463,7 @@ static void exofs_put_super(struct super_block *sb) _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); + exofs_sysfs_sb_del(sbi); bdi_destroy(&sbi->bdi); exofs_free_sbi(sbi); sb->s_fs_info = NULL; @@ -529,7 +521,8 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, struct osd_dev_info *odi) { odi->systemid_len = le32_to_cpu(dt_dev->systemid_len); - memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len); + if (likely(odi->systemid_len)) + memcpy(odi->systemid, dt_dev->systemid, OSD_SYSTEMID_LEN); odi->osdname_len = le32_to_cpu(dt_dev->osdname_len); odi->osdname = dt_dev->osdname; @@ -550,7 +543,7 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, return !(odi->systemid_len || odi->osdname_len); } -int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, +static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, struct exofs_dev **peds) { struct __alloc_ore_devs_and_exofs_devs { @@ -565,7 +558,7 @@ int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); if (unlikely(!aoded)) { - EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", + EXOFS_ERR("ERROR: failed allocating Device array[%d]\n", numdevs); return -ENOMEM; } @@ -631,6 +624,12 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0], (numdevs - 1) * sizeof(sbi->oc.ods[0])); + /* create sysfs subdir under which we put the device table + * And cluster layout. A Superblock is identified by the string: + * "dev[0].osdname"_"pid" + */ + exofs_sysfs_sb_add(sbi, &dt->dt_dev_table[0]); + for (i = 0; i < numdevs; i++) { struct exofs_fscb fscb; struct osd_dev_info odi; @@ -656,6 +655,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, eds[i].ored.od = fscb_od; ++sbi->oc.numdevs; fscb_od = NULL; + exofs_sysfs_odev_add(&eds[i], sbi); continue; } @@ -681,6 +681,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, odi.osdname); goto out; } + exofs_sysfs_odev_add(&eds[i], sbi); /* TODO: verify other information is correct and FS-uuid * matches. Benny what did you say about device table @@ -744,7 +745,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->one_comp.obj.partition = opts->pid; sbi->one_comp.obj.id = 0; exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); - sbi->oc.numdevs = 1; sbi->oc.single_comp = EC_SINGLE_COMP; sbi->oc.comps = &sbi->one_comp; @@ -754,6 +754,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize = EXOFS_BLKSIZE; sb->s_blocksize_bits = EXOFS_BLKSHIFT; sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_max_links = EXOFS_LINK_MAX; atomic_set(&sbi->s_curr_pending, 0); sb->s_bdev = NULL; sb->s_dev = 0; @@ -802,6 +803,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; ore_comp_set_dev(&sbi->oc, 0, od); + sbi->oc.numdevs = 1; } __sbi_read_stats(sbi); @@ -818,9 +820,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) ret = PTR_ERR(root); goto free_sbi; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - iput(root); EXOFS_ERR("ERROR: get root inode failed\n"); ret = -ENOMEM; goto free_sbi; @@ -843,6 +844,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } + exofs_sysfs_dbg_print(); _exofs_print_device("Mounting", opts->dev_name, ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); @@ -941,7 +943,6 @@ static const struct super_operations exofs_sops = { .write_inode = exofs_write_inode, .evict_inode = exofs_evict_inode, .put_super = exofs_put_super, - .write_super = exofs_write_super, .sync_fs = exofs_sync_fs, .statfs = exofs_statfs, }; @@ -1009,6 +1010,7 @@ static struct file_system_type exofs_type = { .mount = exofs_mount, .kill_sb = generic_shutdown_super, }; +MODULE_ALIAS_FS("exofs"); static int __init init_exofs(void) { @@ -1022,6 +1024,9 @@ static int __init init_exofs(void) if (err) goto out_d; + /* We don't fail if sysfs creation failed */ + exofs_sysfs_init(); + return 0; out_d: destroy_inodecache(); @@ -1031,6 +1036,7 @@ out: static void __exit exit_exofs(void) { + exofs_sysfs_uninit(); unregister_filesystem(&exofs_type); destroy_inodecache(); } diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c new file mode 100644 index 00000000000..1b4f2f95fc3 --- /dev/null +++ b/fs/exofs/sys.c @@ -0,0 +1,205 @@ +/* + * Copyright (C) 2012 + * Sachin Bhamare <sbhamare@panasas.com> + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License 2 as published by + * the Free Software Foundation. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the: + * Free Software Foundation <licensing@fsf.org> + */ + +#include <linux/kobject.h> +#include <linux/device.h> + +#include "exofs.h" + +struct odev_attr { + struct attribute attr; + ssize_t (*show)(struct exofs_dev *, char *); + ssize_t (*store)(struct exofs_dev *, const char *, size_t); +}; + +static ssize_t odev_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj); + struct odev_attr *a = container_of(attr, struct odev_attr, attr); + + return a->show ? a->show(edp, buf) : 0; +} + +static ssize_t odev_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj); + struct odev_attr *a = container_of(attr, struct odev_attr, attr); + + return a->store ? a->store(edp, buf, len) : len; +} + +static const struct sysfs_ops odev_attr_ops = { + .show = odev_attr_show, + .store = odev_attr_store, +}; + + +static struct kset *exofs_kset; + +static ssize_t osdname_show(struct exofs_dev *edp, char *buf) +{ + struct osd_dev *odev = edp->ored.od; + const struct osd_dev_info *odi = osduld_device_info(odev); + + return snprintf(buf, odi->osdname_len + 1, "%s", odi->osdname); +} + +static ssize_t systemid_show(struct exofs_dev *edp, char *buf) +{ + struct osd_dev *odev = edp->ored.od; + const struct osd_dev_info *odi = osduld_device_info(odev); + + memcpy(buf, odi->systemid, odi->systemid_len); + return odi->systemid_len; +} + +static ssize_t uri_show(struct exofs_dev *edp, char *buf) +{ + return snprintf(buf, edp->urilen, "%s", edp->uri); +} + +static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len) +{ + uint8_t *new_uri; + + edp->urilen = strlen(buf) + 1; + new_uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL); + if (new_uri == NULL) + return -ENOMEM; + edp->uri = new_uri; + strncpy(edp->uri, buf, edp->urilen); + return edp->urilen; +} + +#define OSD_ATTR(name, mode, show, store) \ + static struct odev_attr odev_attr_##name = \ + __ATTR(name, mode, show, store) + +OSD_ATTR(osdname, S_IRUGO, osdname_show, NULL); +OSD_ATTR(systemid, S_IRUGO, systemid_show, NULL); +OSD_ATTR(uri, S_IRWXU, uri_show, uri_store); + +static struct attribute *odev_attrs[] = { + &odev_attr_osdname.attr, + &odev_attr_systemid.attr, + &odev_attr_uri.attr, + NULL, +}; + +static struct kobj_type odev_ktype = { + .default_attrs = odev_attrs, + .sysfs_ops = &odev_attr_ops, +}; + +static struct kobj_type uuid_ktype = { +}; + +void exofs_sysfs_dbg_print(void) +{ +#ifdef CONFIG_EXOFS_DEBUG + struct kobject *k_name, *k_tmp; + + list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) { + printk(KERN_INFO "%s: name %s ref %d\n", + __func__, kobject_name(k_name), + (int)atomic_read(&k_name->kref.refcount)); + } +#endif +} +/* + * This function removes all kobjects under exofs_kset + * At the end of it, exofs_kset kobject will have a refcount + * of 1 which gets decremented only on exofs module unload + */ +void exofs_sysfs_sb_del(struct exofs_sb_info *sbi) +{ + struct kobject *k_name, *k_tmp; + struct kobject *s_kobj = &sbi->s_kobj; + + list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) { + /* Remove all that are children of this SBI */ + if (k_name->parent == s_kobj) + kobject_put(k_name); + } + kobject_put(s_kobj); +} + +/* + * This function creates sysfs entries to hold the current exofs cluster + * instance (uniquely identified by osdname,pid tuple). + * This function gets called once per exofs mount instance. + */ +int exofs_sysfs_sb_add(struct exofs_sb_info *sbi, + struct exofs_dt_device_info *dt_dev) +{ + struct kobject *s_kobj; + int retval = 0; + uint64_t pid = sbi->one_comp.obj.partition; + + /* allocate new uuid dirent */ + s_kobj = &sbi->s_kobj; + s_kobj->kset = exofs_kset; + retval = kobject_init_and_add(s_kobj, &uuid_ktype, + &exofs_kset->kobj, "%s_%llx", dt_dev->osdname, pid); + if (retval) { + EXOFS_ERR("ERROR: Failed to create sysfs entry for " + "uuid-%s_%llx => %d\n", dt_dev->osdname, pid, retval); + return -ENOMEM; + } + return 0; +} + +int exofs_sysfs_odev_add(struct exofs_dev *edev, struct exofs_sb_info *sbi) +{ + struct kobject *d_kobj; + int retval = 0; + + /* create osd device group which contains following attributes + * osdname, systemid & uri + */ + d_kobj = &edev->ed_kobj; + d_kobj->kset = exofs_kset; + retval = kobject_init_and_add(d_kobj, &odev_ktype, + &sbi->s_kobj, "dev%u", edev->did); + if (retval) { + EXOFS_ERR("ERROR: Failed to create sysfs entry for " + "device dev%u\n", edev->did); + return retval; + } + return 0; +} + +int exofs_sysfs_init(void) +{ + exofs_kset = kset_create_and_add("exofs", NULL, fs_kobj); + if (!exofs_kset) { + EXOFS_ERR("ERROR: kset_create_and_add exofs failed\n"); + return -ENOMEM; + } + return 0; +} + +void exofs_sysfs_uninit(void) +{ + kset_unregister(exofs_kset); +} |
