diff options
Diffstat (limited to 'fs/dlm/lock.c')
| -rw-r--r-- | fs/dlm/lock.c | 370 |
1 files changed, 294 insertions, 76 deletions
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index d9ee1b96549..83f3d552030 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -687,6 +687,7 @@ static int find_rsb_dir(struct dlm_ls *ls, char *name, int len, log_error(ls, "find_rsb new from_other %d dir %d our %d %s", from_nodeid, dir_nodeid, our_nodeid, r->res_name); dlm_free_rsb(r); + r = NULL; error = -ENOTBLK; goto out_unlock; } @@ -1132,6 +1133,7 @@ static void toss_rsb(struct kref *kref) rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep); rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss); r->res_toss_time = jiffies; + ls->ls_rsbtbl[r->res_bucket].flags |= DLM_RTF_SHRINK; if (r->res_lvbptr) { dlm_free_lvb(r->res_lvbptr); r->res_lvbptr = NULL; @@ -1182,7 +1184,7 @@ static void detach_lkb(struct dlm_lkb *lkb) static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) { struct dlm_lkb *lkb; - int rv, id; + int rv; lkb = dlm_allocate_lkb(ls); if (!lkb) @@ -1198,19 +1200,13 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) mutex_init(&lkb->lkb_cb_mutex); INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); - retry: - rv = idr_pre_get(&ls->ls_lkbidr, GFP_NOFS); - if (!rv) - return -ENOMEM; - + idr_preload(GFP_NOFS); spin_lock(&ls->ls_lkbidr_spin); - rv = idr_get_new_above(&ls->ls_lkbidr, lkb, 1, &id); - if (!rv) - lkb->lkb_id = id; + rv = idr_alloc(&ls->ls_lkbidr, lkb, 1, 0, GFP_NOWAIT); + if (rv >= 0) + lkb->lkb_id = rv; spin_unlock(&ls->ls_lkbidr_spin); - - if (rv == -EAGAIN) - goto retry; + idr_preload_end(); if (rv < 0) { log_error(ls, "create_lkb idr error %d", rv); @@ -1624,65 +1620,184 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms) return error; } -/* FIXME: make this more efficient */ +/* If there's an rsb for the same resource being removed, ensure + that the remove message is sent before the new lookup message. + It should be rare to need a delay here, but if not, then it may + be worthwhile to add a proper wait mechanism rather than a delay. */ -static int shrink_bucket(struct dlm_ls *ls, int b) +static void wait_pending_remove(struct dlm_rsb *r) { - struct rb_node *n; + struct dlm_ls *ls = r->res_ls; + restart: + spin_lock(&ls->ls_remove_spin); + if (ls->ls_remove_len && + !rsb_cmp(r, ls->ls_remove_name, ls->ls_remove_len)) { + log_debug(ls, "delay lookup for remove dir %d %s", + r->res_dir_nodeid, r->res_name); + spin_unlock(&ls->ls_remove_spin); + msleep(1); + goto restart; + } + spin_unlock(&ls->ls_remove_spin); +} + +/* + * ls_remove_spin protects ls_remove_name and ls_remove_len which are + * read by other threads in wait_pending_remove. ls_remove_names + * and ls_remove_lens are only used by the scan thread, so they do + * not need protection. + */ + +static void shrink_bucket(struct dlm_ls *ls, int b) +{ + struct rb_node *n, *next; struct dlm_rsb *r; + char *name; int our_nodeid = dlm_our_nodeid(); - int count = 0, found; + int remote_count = 0; + int need_shrink = 0; + int i, len, rv; - for (;;) { - found = 0; - spin_lock(&ls->ls_rsbtbl[b].lock); - for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = rb_next(n)) { - r = rb_entry(n, struct dlm_rsb, res_hashnode); + memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX); - /* If we're the directory record for this rsb, and - we're not the master of it, then we need to wait - for the master node to send us a dir remove for - before removing the dir record. */ + spin_lock(&ls->ls_rsbtbl[b].lock); - if (!dlm_no_directory(ls) && !is_master(r) && - (dlm_dir_nodeid(r) == our_nodeid)) { - continue; - } + if (!(ls->ls_rsbtbl[b].flags & DLM_RTF_SHRINK)) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + return; + } - if (!time_after_eq(jiffies, r->res_toss_time + - dlm_config.ci_toss_secs * HZ)) - continue; - found = 1; - break; + for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) { + next = rb_next(n); + r = rb_entry(n, struct dlm_rsb, res_hashnode); + + /* If we're the directory record for this rsb, and + we're not the master of it, then we need to wait + for the master node to send us a dir remove for + before removing the dir record. */ + + if (!dlm_no_directory(ls) && + (r->res_master_nodeid != our_nodeid) && + (dlm_dir_nodeid(r) == our_nodeid)) { + continue; } - if (!found) { - spin_unlock(&ls->ls_rsbtbl[b].lock); - break; + need_shrink = 1; + + if (!time_after_eq(jiffies, r->res_toss_time + + dlm_config.ci_toss_secs * HZ)) { + continue; } - if (kref_put(&r->res_ref, kill_rsb)) { - rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); - spin_unlock(&ls->ls_rsbtbl[b].lock); + if (!dlm_no_directory(ls) && + (r->res_master_nodeid == our_nodeid) && + (dlm_dir_nodeid(r) != our_nodeid)) { /* We're the master of this rsb but we're not the directory record, so we need to tell the dir node to remove the dir record. */ - if (!dlm_no_directory(ls) && is_master(r) && - (dlm_dir_nodeid(r) != our_nodeid)) { - send_remove(r); - } + ls->ls_remove_lens[remote_count] = r->res_length; + memcpy(ls->ls_remove_names[remote_count], r->res_name, + DLM_RESNAME_MAXLEN); + remote_count++; - dlm_free_rsb(r); - count++; - } else { - spin_unlock(&ls->ls_rsbtbl[b].lock); + if (remote_count >= DLM_REMOVE_NAMES_MAX) + break; + continue; + } + + if (!kref_put(&r->res_ref, kill_rsb)) { log_error(ls, "tossed rsb in use %s", r->res_name); + continue; } + + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); + dlm_free_rsb(r); } - return count; + if (need_shrink) + ls->ls_rsbtbl[b].flags |= DLM_RTF_SHRINK; + else + ls->ls_rsbtbl[b].flags &= ~DLM_RTF_SHRINK; + spin_unlock(&ls->ls_rsbtbl[b].lock); + + /* + * While searching for rsb's to free, we found some that require + * remote removal. We leave them in place and find them again here + * so there is a very small gap between removing them from the toss + * list and sending the removal. Keeping this gap small is + * important to keep us (the master node) from being out of sync + * with the remote dir node for very long. + * + * From the time the rsb is removed from toss until just after + * send_remove, the rsb name is saved in ls_remove_name. A new + * lookup checks this to ensure that a new lookup message for the + * same resource name is not sent just before the remove message. + */ + + for (i = 0; i < remote_count; i++) { + name = ls->ls_remove_names[i]; + len = ls->ls_remove_lens[i]; + + spin_lock(&ls->ls_rsbtbl[b].lock); + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); + if (rv) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_debug(ls, "remove_name not toss %s", name); + continue; + } + + if (r->res_master_nodeid != our_nodeid) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_debug(ls, "remove_name master %d dir %d our %d %s", + r->res_master_nodeid, r->res_dir_nodeid, + our_nodeid, name); + continue; + } + + if (r->res_dir_nodeid == our_nodeid) { + /* should never happen */ + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_error(ls, "remove_name dir %d master %d our %d %s", + r->res_dir_nodeid, r->res_master_nodeid, + our_nodeid, name); + continue; + } + + if (!time_after_eq(jiffies, r->res_toss_time + + dlm_config.ci_toss_secs * HZ)) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_debug(ls, "remove_name toss_time %lu now %lu %s", + r->res_toss_time, jiffies, name); + continue; + } + + if (!kref_put(&r->res_ref, kill_rsb)) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_error(ls, "remove_name in use %s", name); + continue; + } + + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); + + /* block lookup of same name until we've sent remove */ + spin_lock(&ls->ls_remove_spin); + ls->ls_remove_len = len; + memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); + spin_unlock(&ls->ls_remove_spin); + spin_unlock(&ls->ls_rsbtbl[b].lock); + + send_remove(r); + + /* allow lookup of name again */ + spin_lock(&ls->ls_remove_spin); + ls->ls_remove_len = 0; + memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN); + spin_unlock(&ls->ls_remove_spin); + + dlm_free_rsb(r); + } } void dlm_scan_rsbs(struct dlm_ls *ls) @@ -1924,8 +2039,8 @@ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; if (b == 1) { int len = receive_extralen(ms); - if (len > DLM_RESNAME_MAXLEN) - len = DLM_RESNAME_MAXLEN; + if (len > r->res_ls->ls_lvblen) + len = r->res_ls->ls_lvblen; memcpy(lkb->lkb_lvbptr, ms->m_extra, len); lkb->lkb_lvbseq = ms->m_lvbseq; } @@ -2174,10 +2289,14 @@ static int conversion_deadlock_detect(struct dlm_rsb *r, struct dlm_lkb *lkb2) * immediate request, it is 0 if called later, after the lock has been * queued. * + * recover is 1 if dlm_recover_grant() is trying to grant conversions + * after recovery. + * * References are from chapter 6 of "VAXcluster Principles" by Roy Davis */ -static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now) +static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, + int recover) { int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV); @@ -2209,7 +2328,7 @@ static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now) */ if (queue_conflict(&r->res_grantqueue, lkb)) - goto out; + return 0; /* * 6-3: By default, a conversion request is immediately granted if the @@ -2218,7 +2337,24 @@ static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now) */ if (queue_conflict(&r->res_convertqueue, lkb)) - goto out; + return 0; + + /* + * The RECOVER_GRANT flag means dlm_recover_grant() is granting + * locks for a recovered rsb, on which lkb's have been rebuilt. + * The lkb's may have been rebuilt on the queues in a different + * order than they were in on the previous master. So, granting + * queued conversions in order after recovery doesn't make sense + * since the order hasn't been preserved anyway. The new order + * could also have created a new "in place" conversion deadlock. + * (e.g. old, failed master held granted EX, with PR->EX, NL->EX. + * After recovery, there would be no granted locks, and possibly + * NL->EX, PR->EX, an in-place conversion deadlock.) So, after + * recovery, grant conversions without considering order. + */ + + if (conv && recover) + return 1; /* * 6-5: But the default algorithm for deciding whether to grant or @@ -2255,7 +2391,7 @@ static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now) if (list_empty(&r->res_convertqueue)) return 1; else - goto out; + return 0; } /* @@ -2301,12 +2437,12 @@ static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now) if (!now && !conv && list_empty(&r->res_convertqueue) && first_in_list(lkb, &r->res_waitqueue)) return 1; - out: + return 0; } static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, - int *err) + int recover, int *err) { int rv; int8_t alt = 0, rqmode = lkb->lkb_rqmode; @@ -2315,7 +2451,7 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, if (err) *err = 0; - rv = _can_be_granted(r, lkb, now); + rv = _can_be_granted(r, lkb, now, recover); if (rv) goto out; @@ -2356,7 +2492,7 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, if (alt) { lkb->lkb_rqmode = alt; - rv = _can_be_granted(r, lkb, now); + rv = _can_be_granted(r, lkb, now, 0); if (rv) lkb->lkb_sbflags |= DLM_SBF_ALTMODE; else @@ -2380,6 +2516,7 @@ static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw, unsigned int *count) { struct dlm_lkb *lkb, *s; + int recover = rsb_flag(r, RSB_RECOVER_GRANT); int hi, demoted, quit, grant_restart, demote_restart; int deadlk; @@ -2393,7 +2530,7 @@ static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw, demoted = is_demoted(lkb); deadlk = 0; - if (can_be_granted(r, lkb, 0, &deadlk)) { + if (can_be_granted(r, lkb, 0, recover, &deadlk)) { grant_lock_pending(r, lkb); grant_restart = 1; if (count) @@ -2437,7 +2574,7 @@ static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw, struct dlm_lkb *lkb, *s; list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) { - if (can_be_granted(r, lkb, 0, NULL)) { + if (can_be_granted(r, lkb, 0, 0, NULL)) { grant_lock_pending(r, lkb); if (count) (*count)++; @@ -2608,6 +2745,8 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) return 0; } + wait_pending_remove(r); + r->res_first_lkid = lkb->lkb_id; send_lookup(r, lkb); return 1; @@ -2935,7 +3074,7 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb) { int error = 0; - if (can_be_granted(r, lkb, 1, NULL)) { + if (can_be_granted(r, lkb, 1, 0, NULL)) { grant_lock(r, lkb); queue_cast(r, lkb, 0); goto out; @@ -2975,7 +3114,7 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) /* changing an existing lock may allow others to be granted */ - if (can_be_granted(r, lkb, 1, &deadlk)) { + if (can_be_granted(r, lkb, 1, 0, &deadlk)) { grant_lock(r, lkb); queue_cast(r, lkb, 0); goto out; @@ -3001,7 +3140,7 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) if (is_demoted(lkb)) { grant_pending_convert(r, DLM_LOCK_IV, NULL, NULL); - if (_can_be_granted(r, lkb, 1)) { + if (_can_be_granted(r, lkb, 1, 0)) { grant_lock(r, lkb); queue_cast(r, lkb, 0); goto out; @@ -3755,8 +3894,8 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, if (!lkb->lkb_lvbptr) return -ENOMEM; len = receive_extralen(ms); - if (len > DLM_RESNAME_MAXLEN) - len = DLM_RESNAME_MAXLEN; + if (len > ls->ls_lvblen) + len = ls->ls_lvblen; memcpy(lkb->lkb_lvbptr, ms->m_extra, len); } return 0; @@ -3871,12 +4010,70 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) return error; } +static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len) +{ + char name[DLM_RESNAME_MAXLEN + 1]; + struct dlm_message *ms; + struct dlm_mhandle *mh; + struct dlm_rsb *r; + uint32_t hash, b; + int rv, dir_nodeid; + + memset(name, 0, sizeof(name)); + memcpy(name, ms_name, len); + + hash = jhash(name, len, 0); + b = hash & (ls->ls_rsbtbl_size - 1); + + dir_nodeid = dlm_hash2nodeid(ls, hash); + + log_error(ls, "send_repeat_remove dir %d %s", dir_nodeid, name); + + spin_lock(&ls->ls_rsbtbl[b].lock); + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); + if (!rv) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_error(ls, "repeat_remove on keep %s", name); + return; + } + + rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); + if (!rv) { + spin_unlock(&ls->ls_rsbtbl[b].lock); + log_error(ls, "repeat_remove on toss %s", name); + return; + } + + /* use ls->remove_name2 to avoid conflict with shrink? */ + + spin_lock(&ls->ls_remove_spin); + ls->ls_remove_len = len; + memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); + spin_unlock(&ls->ls_remove_spin); + spin_unlock(&ls->ls_rsbtbl[b].lock); + + rv = _create_message(ls, sizeof(struct dlm_message) + len, + dir_nodeid, DLM_MSG_REMOVE, &ms, &mh); + if (rv) + return; + + memcpy(ms->m_extra, name, len); + ms->m_hash = hash; + + send_message(mh, ms); + + spin_lock(&ls->ls_remove_spin); + ls->ls_remove_len = 0; + memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN); + spin_unlock(&ls->ls_remove_spin); +} + static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; int from_nodeid; - int error, namelen; + int error, namelen = 0; from_nodeid = ms->m_header.h_nodeid; @@ -3944,13 +4141,21 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) delayed in being sent/arriving/being processed on the dir node. Another node would repeatedly lookup up the master, and the dir node would continue returning our nodeid until our send_remove - took effect. */ + took effect. + + We send another remove message in case our previous send_remove + was lost/ignored/missed somehow. */ if (error != -ENOTBLK) { log_limit(ls, "receive_request %x from %d %d", ms->m_lkid, from_nodeid, error); } + if (namelen && error == -EBADR) { + send_repeat_remove(ls, ms->m_extra, namelen); + msleep(1000); + } + setup_stub_lkb(ls, ms); send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); return error; @@ -5198,6 +5403,13 @@ static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r, if ((lkb->lkb_nodeid == nodeid_gone) || dlm_is_removed(ls, lkb->lkb_nodeid)) { + /* tell recover_lvb to invalidate the lvb + because a node holding EX/PW failed */ + if ((lkb->lkb_exflags & DLM_LKF_VALBLK) && + (lkb->lkb_grmode >= DLM_LOCK_PW)) { + rsb_set_flag(r, RSB_RECOVER_LVB_INVAL); + } + del_lkb(r, lkb); /* this put should free the lkb */ @@ -5251,7 +5463,7 @@ void dlm_recover_purge(struct dlm_ls *ls) up_write(&ls->ls_root_sem); if (lkb_count) - log_debug(ls, "dlm_recover_purge %u locks for %u nodes", + log_rinfo(ls, "dlm_recover_purge %u locks for %u nodes", lkb_count, nodes_count); } @@ -5266,9 +5478,10 @@ static struct dlm_rsb *find_grant_rsb(struct dlm_ls *ls, int bucket) if (!rsb_flag(r, RSB_RECOVER_GRANT)) continue; - rsb_clear_flag(r, RSB_RECOVER_GRANT); - if (!is_master(r)) + if (!is_master(r)) { + rsb_clear_flag(r, RSB_RECOVER_GRANT); continue; + } hold_rsb(r); spin_unlock(&ls->ls_rsbtbl[bucket].lock); return r; @@ -5313,7 +5526,9 @@ void dlm_recover_grant(struct dlm_ls *ls) rsb_count++; count = 0; lock_rsb(r); + /* the RECOVER_GRANT flag is checked in the grant path */ grant_pending_locks(r, &count); + rsb_clear_flag(r, RSB_RECOVER_GRANT); lkb_count += count; confirm_master(r, 0); unlock_rsb(r); @@ -5322,7 +5537,7 @@ void dlm_recover_grant(struct dlm_ls *ls) } if (lkb_count) - log_debug(ls, "dlm_recover_grant %u locks on %u resources", + log_rinfo(ls, "dlm_recover_grant %u locks on %u resources", lkb_count, rsb_count); } @@ -5481,7 +5696,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) put_rsb(r); out: if (error && error != -EEXIST) - log_debug(ls, "dlm_recover_master_copy remote %d %x error %d", + log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d", from_nodeid, remid, error); rl->rl_result = cpu_to_le32(error); return error; @@ -5827,15 +6042,18 @@ static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) return error; } -/* The force flag allows the unlock to go ahead even if the lkb isn't granted. - Regardless of what rsb queue the lock is on, it's removed and freed. */ +/* The FORCEUNLOCK flag allows the unlock to go ahead even if the lkb isn't + granted. Regardless of what rsb queue the lock is on, it's removed and + freed. The IVVALBLK flag causes the lvb on the resource to be invalidated + if our lock is PW/EX (it's ignored if our granted mode is smaller.) */ static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) { struct dlm_args args; int error; - set_unlock_args(DLM_LKF_FORCEUNLOCK, lkb->lkb_ua, &args); + set_unlock_args(DLM_LKF_FORCEUNLOCK | DLM_LKF_IVVALBLK, + lkb->lkb_ua, &args); error = unlock_lock(ls, lkb, &args); if (error == -DLM_EUNLOCK) |
