sgi-xpc: prevent false heartbeat failures

The heartbeat timeout functionality in sgi-xpc is currently not trained to
the connection time.  If a connection is made and the code is in the last
polling window prior to doing a timeout, the next polling window will see
the heartbeat as unchanged and initiate a no-heartbeat disconnect.

Signed-off-by: Robin Holt <holt@sgi.com>
Signed-off-by: Dean Nelson <dcn@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Robin Holt
2009-04-13 14:40:18 -07:00
committed by Linus Torvalds
parent a06bba4643
commit a374c57b07
4 changed files with 124 additions and 153 deletions

View File

@@ -46,8 +46,7 @@ struct uv_IO_APIC_route_entry {
};
#endif
static atomic64_t xpc_heartbeat_uv;
static DECLARE_BITMAP(xpc_heartbeating_to_mask_uv, XP_MAX_NPARTITIONS_UV);
static struct xpc_heartbeat_uv *xpc_heartbeat_uv;
#define XPC_ACTIVATE_MSG_SIZE_UV (1 * GRU_CACHE_LINE_BYTES)
#define XPC_ACTIVATE_MQ_SIZE_UV (4 * XP_MAX_NPARTITIONS_UV * \
@@ -423,41 +422,6 @@ xpc_handle_activate_mq_msg_uv(struct xpc_partition *part,
/* syncing of remote_act_state was just done above */
break;
case XPC_ACTIVATE_MQ_MSG_INC_HEARTBEAT_UV: {
struct xpc_activate_mq_msg_heartbeat_req_uv *msg;
msg = container_of(msg_hdr,
struct xpc_activate_mq_msg_heartbeat_req_uv,
hdr);
part_uv->heartbeat = msg->heartbeat;
break;
}
case XPC_ACTIVATE_MQ_MSG_OFFLINE_HEARTBEAT_UV: {
struct xpc_activate_mq_msg_heartbeat_req_uv *msg;
msg = container_of(msg_hdr,
struct xpc_activate_mq_msg_heartbeat_req_uv,
hdr);
part_uv->heartbeat = msg->heartbeat;
spin_lock_irqsave(&part_uv->flags_lock, irq_flags);
part_uv->flags |= XPC_P_HEARTBEAT_OFFLINE_UV;
spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags);
break;
}
case XPC_ACTIVATE_MQ_MSG_ONLINE_HEARTBEAT_UV: {
struct xpc_activate_mq_msg_heartbeat_req_uv *msg;
msg = container_of(msg_hdr,
struct xpc_activate_mq_msg_heartbeat_req_uv,
hdr);
part_uv->heartbeat = msg->heartbeat;
spin_lock_irqsave(&part_uv->flags_lock, irq_flags);
part_uv->flags &= ~XPC_P_HEARTBEAT_OFFLINE_UV;
spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags);
break;
}
case XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV: {
struct xpc_activate_mq_msg_activate_req_uv *msg;
@@ -475,6 +439,7 @@ xpc_handle_activate_mq_msg_uv(struct xpc_partition *part,
part_uv->act_state_req = XPC_P_ASR_ACTIVATE_UV;
part->remote_rp_pa = msg->rp_gpa; /* !!! _pa is _gpa */
part->remote_rp_ts_jiffies = msg_hdr->rp_ts_jiffies;
part_uv->heartbeat_gpa = msg->heartbeat_gpa;
if (msg->activate_gru_mq_desc_gpa !=
part_uv->activate_gru_mq_desc_gpa) {
@@ -759,7 +724,7 @@ xpc_send_local_activate_IRQ_uv(struct xpc_partition *part, int act_state_req)
/*
* !!! Make our side think that the remote partition sent an activate
* !!! message our way by doing what the activate IRQ handler would
* !!! mq message our way by doing what the activate IRQ handler would
* !!! do had one really been sent.
*/
@@ -808,88 +773,80 @@ xpc_get_partition_rsvd_page_pa_uv(void *buf, u64 *cookie, unsigned long *rp_pa,
static int
xpc_setup_rsvd_page_sn_uv(struct xpc_rsvd_page *rp)
{
rp->sn.activate_gru_mq_desc_gpa =
xpc_heartbeat_uv =
&xpc_partitions[sn_partition_id].sn.uv.cached_heartbeat;
rp->sn.uv.heartbeat_gpa = uv_gpa(xpc_heartbeat_uv);
rp->sn.uv.activate_gru_mq_desc_gpa =
uv_gpa(xpc_activate_mq_uv->gru_mq_desc);
return 0;
}
static void
xpc_send_heartbeat_uv(int msg_type)
xpc_allow_hb_uv(short partid)
{
short partid;
struct xpc_partition *part;
struct xpc_activate_mq_msg_heartbeat_req_uv msg;
}
/*
* !!! On uv we're broadcasting a heartbeat message every 5 seconds.
* !!! Whereas on sn2 we're bte_copy'ng the heartbeat info every 20
* !!! seconds. This is an increase in numalink traffic.
* ??? Is this good?
*/
static void
xpc_disallow_hb_uv(short partid)
{
}
msg.heartbeat = atomic64_inc_return(&xpc_heartbeat_uv);
partid = find_first_bit(xpc_heartbeating_to_mask_uv,
XP_MAX_NPARTITIONS_UV);
while (partid < XP_MAX_NPARTITIONS_UV) {
part = &xpc_partitions[partid];
xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
msg_type);
partid = find_next_bit(xpc_heartbeating_to_mask_uv,
XP_MAX_NPARTITIONS_UV, partid + 1);
}
static void
xpc_disallow_all_hbs_uv(void)
{
}
static void
xpc_increment_heartbeat_uv(void)
{
xpc_send_heartbeat_uv(XPC_ACTIVATE_MQ_MSG_INC_HEARTBEAT_UV);
xpc_heartbeat_uv->value++;
}
static void
xpc_offline_heartbeat_uv(void)
{
xpc_send_heartbeat_uv(XPC_ACTIVATE_MQ_MSG_OFFLINE_HEARTBEAT_UV);
xpc_increment_heartbeat_uv();
xpc_heartbeat_uv->offline = 1;
}
static void
xpc_online_heartbeat_uv(void)
{
xpc_send_heartbeat_uv(XPC_ACTIVATE_MQ_MSG_ONLINE_HEARTBEAT_UV);
xpc_increment_heartbeat_uv();
xpc_heartbeat_uv->offline = 0;
}
static void
xpc_heartbeat_init_uv(void)
{
atomic64_set(&xpc_heartbeat_uv, 0);
bitmap_zero(xpc_heartbeating_to_mask_uv, XP_MAX_NPARTITIONS_UV);
xpc_heartbeating_to_mask = &xpc_heartbeating_to_mask_uv[0];
xpc_heartbeat_uv->value = 1;
xpc_heartbeat_uv->offline = 0;
}
static void
xpc_heartbeat_exit_uv(void)
{
xpc_send_heartbeat_uv(XPC_ACTIVATE_MQ_MSG_OFFLINE_HEARTBEAT_UV);
xpc_offline_heartbeat_uv();
}
static enum xp_retval
xpc_get_remote_heartbeat_uv(struct xpc_partition *part)
{
struct xpc_partition_uv *part_uv = &part->sn.uv;
enum xp_retval ret = xpNoHeartbeat;
enum xp_retval ret;
if (part_uv->remote_act_state != XPC_P_AS_INACTIVE &&
part_uv->remote_act_state != XPC_P_AS_DEACTIVATING) {
ret = xp_remote_memcpy(uv_gpa(&part_uv->cached_heartbeat),
part_uv->heartbeat_gpa,
sizeof(struct xpc_heartbeat_uv));
if (ret != xpSuccess)
return ret;
if (part_uv->heartbeat != part->last_heartbeat ||
(part_uv->flags & XPC_P_HEARTBEAT_OFFLINE_UV)) {
if (part_uv->cached_heartbeat.value == part->last_heartbeat &&
!part_uv->cached_heartbeat.offline) {
part->last_heartbeat = part_uv->heartbeat;
ret = xpSuccess;
}
ret = xpNoHeartbeat;
} else {
part->last_heartbeat = part_uv->cached_heartbeat.value;
}
return ret;
}
@@ -904,8 +861,9 @@ xpc_request_partition_activation_uv(struct xpc_rsvd_page *remote_rp,
part->remote_rp_pa = remote_rp_gpa; /* !!! _pa here is really _gpa */
part->remote_rp_ts_jiffies = remote_rp->ts_jiffies;
part->sn.uv.heartbeat_gpa = remote_rp->sn.uv.heartbeat_gpa;
part->sn.uv.activate_gru_mq_desc_gpa =
remote_rp->sn.activate_gru_mq_desc_gpa;
remote_rp->sn.uv.activate_gru_mq_desc_gpa;
/*
* ??? Is it a good idea to make this conditional on what is
@@ -913,8 +871,9 @@ xpc_request_partition_activation_uv(struct xpc_rsvd_page *remote_rp,
*/
if (part->sn.uv.remote_act_state == XPC_P_AS_INACTIVE) {
msg.rp_gpa = uv_gpa(xpc_rsvd_page);
msg.heartbeat_gpa = xpc_rsvd_page->sn.uv.heartbeat_gpa;
msg.activate_gru_mq_desc_gpa =
xpc_rsvd_page->sn.activate_gru_mq_desc_gpa;
xpc_rsvd_page->sn.uv.activate_gru_mq_desc_gpa;
xpc_send_activate_IRQ_part_uv(part, &msg, sizeof(msg),
XPC_ACTIVATE_MQ_MSG_ACTIVATE_REQ_UV);
}
@@ -1677,6 +1636,10 @@ xpc_init_uv(void)
xpc_process_activate_IRQ_rcvd = xpc_process_activate_IRQ_rcvd_uv;
xpc_get_partition_rsvd_page_pa = xpc_get_partition_rsvd_page_pa_uv;
xpc_setup_rsvd_page_sn = xpc_setup_rsvd_page_sn_uv;
xpc_allow_hb = xpc_allow_hb_uv;
xpc_disallow_hb = xpc_disallow_hb_uv;
xpc_disallow_all_hbs = xpc_disallow_all_hbs_uv;
xpc_increment_heartbeat = xpc_increment_heartbeat_uv;
xpc_offline_heartbeat = xpc_offline_heartbeat_uv;
xpc_online_heartbeat = xpc_online_heartbeat_uv;