libceph: fix messenger retry
In ancient times, the messenger could both initiate and accept connections. An artifact of that was data structures to store/process an incoming ceph_msg_connect request and send an outgoing ceph_msg_connect_reply. Sadly, the negotiation code was referencing those structures and ignoring important information (like the peer's connect_seq) from the correct ones. Among other things, this fixes tight reconnect loops where the server sends RETRY_SESSION and we (the client) retry with the same connect_seq as last time. This bug is pretty easily triggered by injecting socket failures on the MDS and running some fs workload like workunits/direct_io/test_sync_io. Signed-off-by: Sage Weil <sage@inktank.com>
This commit is contained in:
@@ -172,16 +172,8 @@ struct ceph_connection {
|
|||||||
|
|
||||||
/* connection negotiation temps */
|
/* connection negotiation temps */
|
||||||
char in_banner[CEPH_BANNER_MAX_LEN];
|
char in_banner[CEPH_BANNER_MAX_LEN];
|
||||||
union {
|
struct ceph_msg_connect out_connect;
|
||||||
struct { /* outgoing connection */
|
struct ceph_msg_connect_reply in_reply;
|
||||||
struct ceph_msg_connect out_connect;
|
|
||||||
struct ceph_msg_connect_reply in_reply;
|
|
||||||
};
|
|
||||||
struct { /* incoming */
|
|
||||||
struct ceph_msg_connect in_connect;
|
|
||||||
struct ceph_msg_connect_reply out_reply;
|
|
||||||
};
|
|
||||||
};
|
|
||||||
struct ceph_entity_addr actual_peer_addr;
|
struct ceph_entity_addr actual_peer_addr;
|
||||||
|
|
||||||
/* message out temps */
|
/* message out temps */
|
||||||
|
@@ -1540,7 +1540,7 @@ static int process_connect(struct ceph_connection *con)
|
|||||||
* dropped messages.
|
* dropped messages.
|
||||||
*/
|
*/
|
||||||
dout("process_connect got RESET peer seq %u\n",
|
dout("process_connect got RESET peer seq %u\n",
|
||||||
le32_to_cpu(con->in_connect.connect_seq));
|
le32_to_cpu(con->in_reply.connect_seq));
|
||||||
pr_err("%s%lld %s connection reset\n",
|
pr_err("%s%lld %s connection reset\n",
|
||||||
ENTITY_NAME(con->peer_name),
|
ENTITY_NAME(con->peer_name),
|
||||||
ceph_pr_addr(&con->peer_addr.in_addr));
|
ceph_pr_addr(&con->peer_addr.in_addr));
|
||||||
@@ -1566,10 +1566,10 @@ static int process_connect(struct ceph_connection *con)
|
|||||||
* If we sent a smaller connect_seq than the peer has, try
|
* If we sent a smaller connect_seq than the peer has, try
|
||||||
* again with a larger value.
|
* again with a larger value.
|
||||||
*/
|
*/
|
||||||
dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
|
dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
|
||||||
le32_to_cpu(con->out_connect.connect_seq),
|
le32_to_cpu(con->out_connect.connect_seq),
|
||||||
le32_to_cpu(con->in_connect.connect_seq));
|
le32_to_cpu(con->in_reply.connect_seq));
|
||||||
con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
|
con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
|
||||||
ret = prepare_write_connect(con);
|
ret = prepare_write_connect(con);
|
||||||
if (ret < 0)
|
if (ret < 0)
|
||||||
return ret;
|
return ret;
|
||||||
@@ -1583,9 +1583,9 @@ static int process_connect(struct ceph_connection *con)
|
|||||||
*/
|
*/
|
||||||
dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
|
dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
|
||||||
con->peer_global_seq,
|
con->peer_global_seq,
|
||||||
le32_to_cpu(con->in_connect.global_seq));
|
le32_to_cpu(con->in_reply.global_seq));
|
||||||
get_global_seq(con->msgr,
|
get_global_seq(con->msgr,
|
||||||
le32_to_cpu(con->in_connect.global_seq));
|
le32_to_cpu(con->in_reply.global_seq));
|
||||||
ret = prepare_write_connect(con);
|
ret = prepare_write_connect(con);
|
||||||
if (ret < 0)
|
if (ret < 0)
|
||||||
return ret;
|
return ret;
|
||||||
|
Reference in New Issue
Block a user