diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-21 19:01:34 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-21 19:01:34 -0700 |
commit | f3270b16e00f0614fa418dcc50883da5949375b4 (patch) | |
tree | 2f76b9679cbc9e8da39c8a67cd2bc00084d441e7 | |
parent | 3044100e58c84e133791c8b60a2f5bef69d732e4 (diff) | |
parent | 2decd65a2630633cee04d0b83fdcee46ad2989a1 (diff) |
Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2
* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2: (48 commits)
ocfs2: Avoid to evaluate xattr block flags again.
ocfs2/cluster: Release debugfs file elapsed_time_in_ms
ocfs2: Add a mount option "coherency=*" to handle cluster coherency for O_DIRECT writes.
Initialize max_slots early
When I tried to compile I got the following warning: fs/ocfs2/slot_map.c: In function ‘ocfs2_init_slot_info’: fs/ocfs2/slot_map.c:360: warning: ‘bytes’ may be used uninitialized in this function fs/ocfs2/slot_map.c:360: note: ‘bytes’ was declared here Compiler: gcc version 4.4.3 (GCC) on Mandriva I'm not sure why this warning occurs, I think compiler don't know that variable "bytes" is initialized when it is sent by reference to ocfs2_slot_map_physical_size and it throws that ugly warning. However, a simple initialization of "bytes" variable with 0 will fix it.
ocfs2: validate bg_free_bits_count after update
ocfs2/cluster: Bump up dlm protocol to version 1.1
ocfs2/cluster: Show per region heartbeat elapsed time
ocfs2/cluster: Add mlogs for heartbeat up/down events
ocfs2/cluster: Create debugfs dir/files for each region
ocfs2/cluster: Create debugfs files for live, quorum and failed region bitmaps
ocfs2/cluster: Maintain bitmap of failed regions
ocfs2/cluster: Maintain bitmap of quorum regions
ocfs2/cluster: Track bitmap of live heartbeat regions
ocfs2/cluster: Track number of global heartbeat regions
ocfs2/cluster: Maintain live node bitmap per heartbeat region
ocfs2/cluster: Reorganize o2hb debugfs init
ocfs2/cluster: Check slots for unconfigured live nodes
ocfs2/cluster: Print messages when adding/removing nodes
ocfs2/cluster: Print messages when adding/removing heartbeat regions
...
39 files changed, 1894 insertions, 173 deletions
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index 1f7ae144f6d..5393e661169 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt @@ -87,3 +87,10 @@ dir_resv_level= (*) By default, directory reservations will scale with file reservations - users should rarely need to change this value. If allocation reservations are turned off, this option will have no effect. +coherency=full (*) Disallow concurrent O_DIRECT writes, cluster inode + lock will be taken to force other nodes drop cache, + therefore full cluster coherency is guaranteed even + for O_DIRECT writes. +coherency=buffered Allow concurrent O_DIRECT writes without EX lock among + nodes, which gains high performance at risk of getting + stale data on other nodes. diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 5dbf4dba03c..a367dd04428 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1849,8 +1849,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } - if (le32_to_cpu(es->s_blocks_count) > - (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + if (generic_check_addressable(sb->s_blocksize_bits, + le32_to_cpu(es->s_blocks_count))) { ext3_msg(sb, KERN_ERR, "error: filesystem is too large to mount safely"); if (sizeof(sector_t) < 8) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 26147746c27..7f47c366bf1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2831,15 +2831,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. */ - if ((ext4_blocks_count(es) > - (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || - (ext4_blocks_count(es) > - (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) { + ret = generic_check_addressable(sb->s_blocksize_bits, + ext4_blocks_count(es)); + if (ret) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); if (sizeof(sector_t) < 8) ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); - ret = -EFBIG; goto failed_mount; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0e8014ea6b9..262419f83d8 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1371,6 +1371,10 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat, if (!compat && !ro && !incompat) return 1; + /* Load journal superblock if it is not loaded yet. */ + if (journal->j_format_version == 0 && + journal_get_superblock(journal) != 0) + return 0; if (journal->j_format_version == 1) return 0; diff --git a/fs/libfs.c b/fs/libfs.c index 0a9da95317f..62baa0387d6 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -913,6 +913,35 @@ int generic_file_fsync(struct file *file, int datasync) } EXPORT_SYMBOL(generic_file_fsync); +/** + * generic_check_addressable - Check addressability of file system + * @blocksize_bits: log of file system block size + * @num_blocks: number of blocks in file system + * + * Determine whether a file system with @num_blocks blocks (and a + * block size of 2**@blocksize_bits) is addressable by the sector_t + * and page cache of the system. Return 0 if so and -EFBIG otherwise. + */ +int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) +{ + u64 last_fs_block = num_blocks - 1; + u64 last_fs_page = + last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits); + + if (unlikely(num_blocks == 0)) + return 0; + + if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT)) + return -EINVAL; + + if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || + (last_fs_page > (pgoff_t)(~0ULL))) { + return -EFBIG; + } + return 0; +} +EXPORT_SYMBOL(generic_check_addressable); + /* * No-op implementation of ->fsync for in-memory filesystems. */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0de69c9a08b..5cfeee11815 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -883,8 +883,8 @@ struct ocfs2_write_ctxt { * out in so that future reads from that region will get * zero's. */ - struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; unsigned int w_num_pages; + struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; struct page *w_target_page; /* @@ -1642,7 +1642,8 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, return ret; } -int ocfs2_write_begin_nolock(struct address_space *mapping, +int ocfs2_write_begin_nolock(struct file *filp, + struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page) @@ -1692,7 +1693,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, mlog_errno(ret); goto out; } else if (ret == 1) { - ret = ocfs2_refcount_cow(inode, di_bh, + ret = ocfs2_refcount_cow(inode, filp, di_bh, wc->w_cpos, wc->w_clen, UINT_MAX); if (ret) { mlog_errno(ret); @@ -1854,7 +1855,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, + ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, fsdata, di_bh, NULL); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index c48e93ffc51..7606f663da6 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -48,7 +48,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); -int ocfs2_write_begin_nolock(struct address_space *mapping, +int ocfs2_write_begin_nolock(struct file *filp, + struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 41d5f1f92d5..52c7557f3e2 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -62,10 +62,51 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; static LIST_HEAD(o2hb_node_events); static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); +/* + * In global heartbeat, we maintain a series of region bitmaps. + * - o2hb_region_bitmap allows us to limit the region number to max region. + * - o2hb_live_region_bitmap tracks live regions (seen steady iterations). + * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes + * heartbeat on it. + * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts. + */ +static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; +static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; + +#define O2HB_DB_TYPE_LIVENODES 0 +#define O2HB_DB_TYPE_LIVEREGIONS 1 +#define O2HB_DB_TYPE_QUORUMREGIONS 2 +#define O2HB_DB_TYPE_FAILEDREGIONS 3 +#define O2HB_DB_TYPE_REGION_LIVENODES 4 +#define O2HB_DB_TYPE_REGION_NUMBER 5 +#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6 +struct o2hb_debug_buf { + int db_type; + int db_size; + int db_len; + void *db_data; +}; + +static struct o2hb_debug_buf *o2hb_db_livenodes; +static struct o2hb_debug_buf *o2hb_db_liveregions; +static struct o2hb_debug_buf *o2hb_db_quorumregions; +static struct o2hb_debug_buf *o2hb_db_failedregions; + #define O2HB_DEBUG_DIR "o2hb" #define O2HB_DEBUG_LIVENODES "livenodes" +#define O2HB_DEBUG_LIVEREGIONS "live_regions" +#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions" +#define O2HB_DEBUG_FAILEDREGIONS "failed_regions" +#define O2HB_DEBUG_REGION_NUMBER "num" +#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms" + static struct dentry *o2hb_debug_dir; static struct dentry *o2hb_debug_livenodes; +static struct dentry *o2hb_debug_liveregions; +static struct dentry *o2hb_debug_quorumregions; +static struct dentry *o2hb_debug_failedregions; static LIST_HEAD(o2hb_all_regions); @@ -77,7 +118,19 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type); #define O2HB_DEFAULT_BLOCK_BITS 9 +enum o2hb_heartbeat_modes { + O2HB_HEARTBEAT_LOCAL = 0, + O2HB_HEARTBEAT_GLOBAL, + O2HB_HEARTBEAT_NUM_MODES, +}; + +char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { + "local", /* O2HB_HEARTBEAT_LOCAL */ + "global", /* O2HB_HEARTBEAT_GLOBAL */ +}; + unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; +unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; /* Only sets a new threshold if there are no active regions. * @@ -94,6 +147,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold) } } +static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode) +{ + int ret = -1; + + if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) { + spin_lock(&o2hb_live_lock); + if (list_empty(&o2hb_all_regions)) { + o2hb_heartbeat_mode = hb_mode; + ret = 0; + } + spin_unlock(&o2hb_live_lock); + } + + return ret; +} + struct o2hb_node_event { struct list_head hn_item; enum o2hb_callback_type hn_event_type; @@ -135,6 +204,18 @@ struct o2hb_region { struct block_device *hr_bdev; struct o2hb_disk_slot *hr_slots; + /* live node map of this region */ + unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned int hr_region_num; + + struct dentry *hr_debug_dir; + struct dentry *hr_debug_livenodes; + struct dentry *hr_debug_regnum; + struct dentry *hr_debug_elapsed_time; + struct o2hb_debug_buf *hr_db_livenodes; + struct o2hb_debug_buf *hr_db_regnum; + struct o2hb_debug_buf *hr_db_elapsed_time; + /* let the person setting up hb wait for it to return until it * has reached a 'steady' state. This will be fixed when we have * a more complete api that doesn't lead to this sort of fragility. */ @@ -163,8 +244,19 @@ struct o2hb_bio_wait_ctxt { int wc_error; }; +static int o2hb_pop_count(void *map, int count) +{ + int i = -1, pop = 0; + + while ((i = find_next_bit(map, count, i + 1)) < count) + pop++; + return pop; +} + static void o2hb_write_timeout(struct work_struct *work) { + int failed, quorum; + unsigned long flags; struct o2hb_region *reg = container_of(work, struct o2hb_region, hr_write_timeout_work.work); @@ -172,6 +264,28 @@ static void o2hb_write_timeout(struct work_struct *work) mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " "milliseconds\n", reg->hr_dev_name, jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); + + if (o2hb_global_heartbeat_active()) { + spin_lock_irqsave(&o2hb_live_lock, flags); + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); + failed = o2hb_pop_count(&o2hb_failed_region_bitmap, + O2NM_MAX_REGIONS); + quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap, + O2NM_MAX_REGIONS); + spin_unlock_irqrestore(&o2hb_live_lock, flags); + + mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", + quorum, failed); + + /* + * Fence if the number of failed regions >= half the number + * of quorum regions + */ + if ((failed << 1) < quorum) + return; + } + o2quo_disk_timeout(); } @@ -180,6 +294,11 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg) mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS); + if (o2hb_global_heartbeat_active()) { + spin_lock(&o2hb_live_lock); + clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap); + spin_unlock(&o2hb_live_lock); + } cancel_delayed_work(®->hr_write_timeout_work); reg->hr_last_timeout_start = jiffies; schedule_delayed_work(®->hr_write_timeout_work, @@ -513,6 +632,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event, { assert_spin_locked(&o2hb_live_lock); + BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB)); + event->hn_event_type = type; event->hn_node = node; event->hn_node_num = node_num; @@ -554,6 +675,35 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) o2nm_node_put(node); } +static void o2hb_set_quorum_device(struct o2hb_region *reg, + struct o2hb_disk_slot *slot) +{ + assert_spin_locked(&o2hb_live_lock); + + if (!o2hb_global_heartbeat_active()) + return; + + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + return; + + /* + * A region can be added to the quorum only when it sees all + * live nodes heartbeat on it. In other words, the region has been + * added to all nodes. + */ + if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, + sizeof(o2hb_live_node_bitmap))) + return; + + if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD) + return; + + printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n", + config_item_name(®->hr_item)); + + set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); +} + static int o2hb_check_slot(struct o2hb_region *reg, struct o2hb_disk_slot *slot) { @@ -565,14 +715,22 @@ static int o2hb_check_slot(struct o2hb_region *reg, u64 cputime; unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; unsigned int slot_dead_ms; + int tmp; memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); - /* Is this correct? Do we assume that the node doesn't exist - * if we're not configured for him? */ + /* + * If a node is no longer configured but is still in the livemap, we + * may need to clear that bit from the livemap. + */ node = o2nm_get_node_by_num(slot->ds_node_num); - if (!node) - return 0; + if (!node) { + spin_lock(&o2hb_live_lock); + tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap); + spin_unlock(&o2hb_live_lock); + if (!tmp) + return 0; + } if (!o2hb_verify_crc(reg, hb_block)) { /* all paths from here will drop o2hb_live_lock for @@ -639,8 +797,12 @@ fire_callbacks: mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n", slot->ds_node_num, (long long)slot->ds_last_generation); + set_bit(slot->ds_node_num, reg->hr_live_node_bitmap); + /* first on the list generates a callback */ if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes " + "bitmap\n", slot->ds_node_num); set_bit(slot->ds_node_num, o2hb_live_node_bitmap); o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, @@ -684,13 +846,18 @@ fire_callbacks: mlog(ML_HEARTBEAT, "Node %d left my region\n", slot->ds_node_num); + clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap); + /* last off the live_slot generates a callback */ list_del_init(&slot->ds_live_item); if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live " + "nodes bitmap\n", slot->ds_node_num); clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); - o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, - slot->ds_node_num); + /* node can be null */ + o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, + node, slot->ds_node_num); changed = 1; } @@ -706,11 +873,14 @@ fire_callbacks: slot->ds_equal_samples = 0; } out: + o2hb_set_quorum_device(reg, slot); + spin_unlock(&o2hb_live_lock); o2hb_run_event_list(&event); - o2nm_node_put(node); + if (node) + o2nm_node_put(node); return changed; } @@ -737,6 +907,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { int i, ret, highest_node, change = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct o2hb_bio_wait_ctxt write_wc; ret = o2nm_configured_node_map(configured_nodes, @@ -746,6 +917,17 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) return ret; } + /* + * If a node is not configured but is in the livemap, we still need + * to read the slot so as to be able to remove it from the livemap. + */ + o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + i = -1; + while ((i = find_next_bit(live_node_bitmap, + O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { + set_bit(i, configured_nodes); + } + highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); if (highest_node >= O2NM_MAX_NODES) { mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); @@ -917,21 +1099,59 @@ static int o2hb_thread(void *data) #ifdef CONFIG_DEBUG_FS static int o2hb_debug_open(struct inode *inode, struct file *file) { + struct o2hb_debug_buf *db = inode->i_private; + struct o2hb_region *reg; unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; char *buf = NULL; int i = -1; int out = 0; + /* max_nodes should be the largest bitmap we pass here */ + BUG_ON(sizeof(map) < db->db_size); + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) goto bail; - o2hb_fill_node_map(map, sizeof(map)); + switch (db->db_type) { + case O2HB_DB_TYPE_LIVENODES: + case O2HB_DB_TYPE_LIVEREGIONS: + case O2HB_DB_TYPE_QUORUMREGIONS: + case O2HB_DB_TYPE_FAILEDREGIONS: + spin_lock(&o2hb_live_lock); + memcpy(map, db->db_data, db->db_size); + spin_unlock(&o2hb_live_lock); + break; + + case O2HB_DB_TYPE_REGION_LIVENODES: + spin_lock(&o2hb_live_lock); + reg = (struct o2hb_region *)db->db_data; + memcpy(map, reg->hr_live_node_bitmap, db->db_size); + spin_unlock(&o2hb_live_lock); + break; + + case O2HB_DB_TYPE_REGION_NUMBER: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%d\n", + reg->hr_region_num); + goto done; + + case O2HB_DB_TYPE_REGION_ELAPSED_TIME: + reg = (struct o2hb_region *)db->db_data; + out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", + jiffies_to_msecs(jiffies - + reg->hr_last_timeout_start)); + goto done; + + default: + goto done; + } - while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) + while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len) out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); out += snprintf(buf + out, PAGE_SIZE - out, "\n"); +done: i_size_write(inode, out); file->private_data = buf; @@ -978,10 +1198,104 @@ static const struct file_operations o2hb_debug_fops = { void o2hb_exit(void) { - if (o2hb_debug_livenodes) - debugfs_remove(o2hb_debug_livenodes); - if (o2hb_debug_dir) - debugfs_remove(o2hb_debug_dir); + kfree(o2hb_db_livenodes); + kfree(o2hb_db_liveregions); + kfree(o2hb_db_quorumregions); + kfree(o2hb_db_failedregions); + debugfs_remove(o2hb_debug_failedregions); + debugfs_remove(o2hb_debug_quorumregions); + debugfs_remove(o2hb_debug_liveregions); + debugfs_remove(o2hb_debug_livenodes); + debugfs_remove(o2hb_debug_dir); +} + +static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, + struct o2hb_debug_buf **db, int db_len, + int type, int size, int len, void *data) +{ + *db = kmalloc(db_len, GFP_KERNEL); + if (!*db) + return NULL; + + (*db)->db_type = type; + (*db)->db_size = size; + (*db)->db_len = len; + (*db)->db_data = data; + + return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, + &o2hb_debug_fops); +} + +static int o2hb_debug_init(void) +{ + int ret = -ENOMEM; + + o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); + if (!o2hb_debug_dir) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES, + o2hb_debug_dir, + &o2hb_db_livenodes, + sizeof(*o2hb_db_livenodes), + O2HB_DB_TYPE_LIVENODES, + sizeof(o2hb_live_node_bitmap), + O2NM_MAX_NODES, + o2hb_live_node_bitmap); + if (!o2hb_debug_livenodes) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, + o2hb_debug_dir, + &o2hb_db_liveregions, + sizeof(*o2hb_db_liveregions), + O2HB_DB_TYPE_LIVEREGIONS, + sizeof(o2hb_live_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_live_region_bitmap); + if (!o2hb_debug_liveregions) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_quorumregions = + o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, + o2hb_debug_dir, + &o2hb_db_quorumregions, + sizeof(*o2hb_db_quorumregions), + O2HB_DB_TYPE_QUORUMREGIONS, + sizeof(o2hb_quorum_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_quorum_region_bitmap); + if (!o2hb_debug_quorumregions) { + mlog_errno(ret); + goto bail; + } + + o2hb_debug_failedregions = + o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, + o2hb_debug_dir, + &o2hb_db_failedregions, + sizeof(*o2hb_db_failedregions), + O2HB_DB_TYPE_FAILEDREGIONS, + sizeof(o2hb_failed_region_bitmap), + O2NM_MAX_REGIONS, + o2hb_failed_region_bitmap); + if (!o2hb_debug_failedregions) { + mlog_errno(ret); + goto bail; + } + + ret = 0; +bail: + if (ret) + o2hb_exit(); + + return ret; } int o2hb_init(void) @@ -997,24 +1311,12 @@ int o2hb_init(void) INIT_LIST_HEAD(&o2hb_node_events); memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); + memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); + memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); + memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); + memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); - o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); - if (!o2hb_debug_dir) { - mlog_errno(-ENOMEM); - return -ENOMEM; - } - - o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES, - S_IFREG|S_IRUSR, - o2hb_debug_dir, NULL, - &o2hb_debug_fops); - if (!o2hb_debug_livenodes) { - mlog_errno(-ENOMEM); - debugfs_remove(o2hb_debug_dir); - return -ENOMEM; - } - - return 0; + return o2hb_debug_init(); } /* if we're already in a callback then we're already serialized by the sem */ @@ -1078,6 +1380,13 @@ static void o2hb_region_release(struct config_item *item) if (reg->hr_slots) kfree(reg->hr_slots); + kfree(reg->hr_db_regnum); + kfree(reg->hr_db_livenodes); + debugfs_remove(reg->hr_debug_livenodes); + debugfs_remove(reg->hr_debug_regnum); + debugfs_remove(reg->hr_debug_elapsed_time); + debugfs_remove(reg->hr_debug_dir); + spin_lock(&o2hb_live_lock); list_del(®->hr_all_item); spin_unlock(&o2hb_live_lock); @@ -1441,6 +1750,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, /* Ok, we were woken. Make sure it wasn't by drop_item() */ spin_lock(&o2hb_live_lock); hb_task = reg->hr_task; + if (o2hb_global_heartbeat_active()) + set_bit(reg->hr_region_num, o2hb_live_region_bitmap); spin_unlock(&o2hb_live_lock); if (hb_task) @@ -1448,6 +1759,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, else ret = -EIO; + if (hb_task && o2hb_global_heartbeat_active()) + printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n", + config_item_name(®->hr_item)); + out: if (filp) fput(filp); @@ -1586,21 +1901,94 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group : NULL; } +static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) +{ + int ret = -ENOMEM; + + reg->hr_debug_dir = + debugfs_create_dir(config_item_name(®->hr_item), dir); + if (!reg->hr_debug_dir) { + mlog_errno(ret); + goto bail; + } + + reg->hr_debug_livenodes = + o2hb_debug_create(O2HB_DEBUG_LIVENODES, + reg->hr_debug_dir, |