ocfs2/dlm: continue to purge recovery lockres when recovery master goes down

We found a situation, described below, in which the DLM blocks because
successive recovery masters go down.  To solve this problem, we should
purge the recovery lock once we detect that the recovery master has gone down.

N3                      N2                   N1(reco master)
                        go down
                                             pick up recovery lock and
                                             begin recovering for N2

                                             go down

fail to pick up the
recovery lock, then
purge it:
dlm_purge_lockres
  ->DROPPING_REF is set

sending deref to N1 fails,
recovery lock is not purged

find N1 has gone down, begin
recovering for N1, but
blocked in dlm_do_recovery
as DROPPING_REF is set:
dlm_do_recovery
  ->dlm_pick_recovery_master
    ->dlmlock
      ->dlm_get_lock_resource
        ->__dlm_wait_on_lockres_flags(tmpres,
	  	DLM_LOCK_RES_DROPPING_REF);

Fixes: 8c03439681 ("ocfs2/dlm: clear DROPPING_REF flag when the master goes down")
Link: http://lkml.kernel.org/r/578453AF.8030404@huawei.com
Signed-off-by: Jun Piao <piaojun@huawei.com>
Reviewed-by: Joseph Qi <joseph.qi@huawei.com>
Reviewed-by: Jiufei Xue <xuejiufei@huawei.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
这个提交包含在:
piaojun
2016-08-02 14:02:19 -07:00
提交者 Linus Torvalds
父节点 309e91911d
当前提交 ee8f7fcbe6
修改 4 个文件，包含 74 行新增和 46 行删除

查看文件

@@ -2425,51 +2425,20 @@ int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
mlog(ML_NOTICE, "%s:%.*s: node %u sends deref done "
"but it is already derefed!\n", dlm->name,
res->lockname.len, res->lockname.name, node);
dlm_lockres_put(res);
ret = 0;
goto done;
}
if (!list_empty(&res->purge)) {
mlog(0, "%s: Removing res %.*s from purgelist\n",
dlm->name, res->lockname.len, res->lockname.name);
list_del_init(&res->purge);
dlm_lockres_put(res);
dlm->purge_count--;
}
if (!__dlm_lockres_unused(res)) {
mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
dlm->name, res->lockname.len, res->lockname.name);
__dlm_print_one_lock_resource(res);
BUG();
}
__dlm_unhash_lockres(dlm, res);
spin_lock(&dlm->track_lock);
if (!list_empty(&res->tracking))
list_del_init(&res->tracking);
else {
mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n",
dlm->name, res->lockname.len, res->lockname.name);
__dlm_print_one_lock_resource(res);
}
spin_unlock(&dlm->track_lock);
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource.
*/
res->state &= ~DLM_LOCK_RES_DROPPING_REF;
__dlm_do_purge_lockres(dlm, res);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
dlm_lockres_put(res);
spin_unlock(&dlm->spinlock);
ret = 0;
done:
if (res)
dlm_lockres_put(res);
dlm_put(dlm);
return ret;
}