[DLM] block dlm_recv in recovery transition
Introduce a per-lockspace rwsem that's held in read mode by dlm_recv
threads while working in the dlm.  This allows dlm_recv activity to be
suspended when the lockspace transitions to, from and between recovery
cycles.

The specific bug prompting this change is one where an in-progress
recovery cycle is aborted by a new recovery cycle.  While dlm_recv was
processing a recovery message, the recovery cycle was aborted and
dlm_recoverd began cleaning up.  dlm_recv decremented recover_locks_count
on an rsb after dlm_recoverd had reset it to zero.  This is fixed by
suspending dlm_recv (taking the write lock on the rwsem) before aborting
the current recovery.

The transitions to/from normal and recovery modes are simplified by using
this new ability to block dlm_recv.  The switch from normal to recovery
mode means dlm_recv goes from processing locking messages, to saving them
for later, and vice versa.  Races are avoided by blocking dlm_recv when
setting the flag that switches between modes.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
committed by Steven Whitehouse
parent b434eda6fd
commit c36258b592
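The commit message describes a simple reader/writer discipline, sketched in
miniature below. This is not code from the patch: the struct, field, and
helper names (recv_active, locking_stopped, save_message, process_message)
are invented for the example.

	#include <linux/rwsem.h>

	struct lockspace {
		struct rw_semaphore recv_active; /* read: dlm_recv, write: recovery transitions */
		int locking_stopped;             /* mode flag, only flipped under the write lock */
	};

	void save_message(struct lockspace *ls, void *msg);    /* queue for dlm_recoverd */
	void process_message(struct lockspace *ls, void *msg); /* normal locking path */

	/* dlm_recv side: a whole message is handled inside the read lock, so a
	   transition can suspend dlm_recv only between messages, never while one
	   is half-processed. */
	void recv_one_message(struct lockspace *ls, void *msg)
	{
		down_read(&ls->recv_active);
		if (ls->locking_stopped)
			save_message(ls, msg);     /* recovery mode: save for later */
		else
			process_message(ls, msg);  /* normal mode: process now */
		up_read(&ls->recv_active);
	}

	/* transition side: down_write() waits for all in-flight dlm_recv readers
	   to drain, so the mode flag cannot change while a reader is acting on
	   the value it just checked. */
	void stop_locking(struct lockspace *ls)
	{
		down_write(&ls->recv_active);
		ls->locking_stopped = 1;
		up_write(&ls->recv_active);
	}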
fs/dlm/requestqueue.c

@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -20,7 +20,7 @@
 struct rq_entry {
 	struct list_head list;
 	int nodeid;
-	char request[1];
+	char request[0];
 };
 
 /*
@@ -30,42 +30,39 @@ struct rq_entry {
  * lockspace is enabled on some while still suspended on others.
  */
 
-int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
 {
 	struct rq_entry *e;
 	int length = hd->h_length;
-	int rv = 0;
 
 	e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
 	if (!e) {
-		log_print("dlm_add_requestqueue: out of memory\n");
-		return 0;
+		log_print("dlm_add_requestqueue: out of memory len %d", length);
+		return;
 	}
 
 	e->nodeid = nodeid;
 	memcpy(e->request, hd, length);
 
-	/* We need to check dlm_locking_stopped() after taking the mutex to
-	   avoid a race where dlm_recoverd enables locking and runs
-	   process_requestqueue between our earlier dlm_locking_stopped check
-	   and this addition to the requestqueue. */
-
 	mutex_lock(&ls->ls_requestqueue_mutex);
-	if (dlm_locking_stopped(ls))
-		list_add_tail(&e->list, &ls->ls_requestqueue);
-	else {
-		log_debug(ls, "dlm_add_requestqueue skip from %d", nodeid);
-		kfree(e);
-		rv = -EAGAIN;
-	}
+	list_add_tail(&e->list, &ls->ls_requestqueue);
 	mutex_unlock(&ls->ls_requestqueue_mutex);
-	return rv;
 }
 
 /*
  * Called by dlm_recoverd to process normal messages saved while recovery was
- * happening.  Normal locking has been enabled before this is called.
+ * happening.  Normal locking has been enabled before this is called.  dlm_recv
+ * upon receiving a message, will wait for all saved messages to be drained
+ * here before processing the message it got.  If a new dlm_ls_stop() arrives
+ * while we're processing these saved messages, it may block trying to suspend
+ * dlm_recv if dlm_recv is waiting for us in dlm_wait_requestqueue.  In that
+ * case, we don't abort since locking_stopped is still 0.  If dlm_recv is not
+ * waiting for us, then this processing may be aborted due to locking_stopped.
  */
 
 int dlm_process_requestqueue(struct dlm_ls *ls)
 {
 	struct rq_entry *e;
-	struct dlm_header *hd;
 	int error = 0;
 
 	mutex_lock(&ls->ls_requestqueue_mutex);
@@ -79,14 +76,7 @@ int dlm_process_requestqueue(struct dlm_ls *ls)
 		e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
 		mutex_unlock(&ls->ls_requestqueue_mutex);
 
-		hd = (struct dlm_header *) e->request;
-		error = dlm_receive_message(hd, e->nodeid, 1);
-
-		if (error == -EINTR) {
-			/* entry is left on requestqueue */
-			log_debug(ls, "process_requestqueue abort eintr");
-			break;
-		}
+		dlm_receive_message_saved(ls, (struct dlm_message *)e->request);
 
 		mutex_lock(&ls->ls_requestqueue_mutex);
 		list_del(&e->list);
@@ -106,10 +96,12 @@ int dlm_process_requestqueue(struct dlm_ls *ls)
 
 /*
  * After recovery is done, locking is resumed and dlm_recoverd takes all the
- * saved requests and processes them as they would have been by dlm_recvd.  At
- * the same time, dlm_recvd will start receiving new requests from remote
- * nodes.  We want to delay dlm_recvd processing new requests until
- * dlm_recoverd has finished processing the old saved requests.
+ * saved requests and processes them as they would have been by dlm_recv.  At
+ * the same time, dlm_recv will start receiving new requests from remote nodes.
+ * We want to delay dlm_recv processing new requests until dlm_recoverd has
+ * finished processing the old saved requests.  We don't check for locking
+ * stopped here because dlm_ls_stop won't stop locking until it's suspended us
+ * (dlm_recv).
  */
 
 void dlm_wait_requestqueue(struct dlm_ls *ls)
@@ -118,8 +110,6 @@ void dlm_wait_requestqueue(struct dlm_ls *ls)
 		mutex_lock(&ls->ls_requestqueue_mutex);
 		if (list_empty(&ls->ls_requestqueue))
 			break;
-		if (dlm_locking_stopped(ls))
-			break;
 		mutex_unlock(&ls->ls_requestqueue_mutex);
 		schedule();
 	}
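One small change in the diff above deserves a note: request[1] becomes
request[0], a zero-length trailing array. The sketch below is illustrative
(the helper name save_request is invented, not from the patch); it shows why
the allocation arithmetic becomes exact once the placeholder byte is gone.
Modern kernel style would spell this as a C99 flexible array member,
char request[];.

	#include <linux/list.h>
	#include <linux/slab.h>
	#include <linux/string.h>

	struct rq_entry {
		struct list_head list;
		int nodeid;
		char request[0];	/* message bytes start right after the fields above */
	};

	/* With request[0], sizeof(struct rq_entry) counts no placeholder byte,
	   so header + length bytes is an exact fit and e->request aliases the
	   copied message, ready to be replayed by dlm_recoverd later. */
	static struct rq_entry *save_request(const void *msg, int length, int nodeid)
	{
		struct rq_entry *e;

		e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
		if (!e)
			return NULL;
		e->nodeid = nodeid;
		memcpy(e->request, msg, length);
		return e;
	}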