libceph, rbd: ceph_osd_linger_request, watch/notify v2

This adds support and switches rbd to a new, more reliable version of
watch/notify protocol.  As with the OSD client update, this is mostly
about getting the right structures linked into the right places so that
reconnects are properly sent when needed.  watch/notify v2 also
requires sending regular pings to the OSDs - send_linger_ping().

A major change from the old watch/notify implementation is the
introduction of ceph_osd_linger_request - linger requests no longer
piggy back on ceph_osd_request.  ceph_osd_event has been merged into
ceph_osd_linger_request.

All the details are now hidden within libceph, the interface consists
of a simple pair of watch/unwatch functions and ceph_osdc_notify_ack().
ceph_osdc_watch() does return ceph_osd_linger_request, but only to keep
the lifetime management simple.

ceph_osdc_notify_ack() accepts an optional data payload, which is
relayed back to the notifier.

Portions of this patch are loosely based on work by Douglas Fuller
<dfuller@redhat.com> and Mike Christie <michaelc@cs.wisc.edu>.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
This commit is contained in:
Ilya Dryomov
2016-05-26 01:15:02 +02:00
parent c525f03601
commit 922dab6134
7 changed files with 1068 additions and 432 deletions

View File

@@ -153,8 +153,9 @@ struct ceph_dir_layout {
/* watch-notify operations */
enum {
WATCH_NOTIFY = 1, /* notifying watcher */
WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */
CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */
CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */
CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */
};

View File

@@ -34,7 +34,7 @@ struct ceph_osd {
struct rb_node o_node;
struct ceph_connection o_con;
struct rb_root o_requests;
struct list_head o_linger_requests;
struct rb_root o_linger_requests;
struct list_head o_osd_lru;
struct ceph_auth_handshake o_auth;
unsigned long lru_ttl;
@@ -108,11 +108,12 @@ struct ceph_osd_req_op {
} cls;
struct {
u64 cookie;
u64 ver;
u32 prot_ver;
u32 timeout;
__u8 flag;
__u8 op; /* CEPH_OSD_WATCH_OP_ */
u32 gen;
} watch;
struct {
struct ceph_osd_data request_data;
} notify_ack;
struct {
u64 expected_object_size;
u64 expected_write_size;
@@ -145,8 +146,6 @@ struct ceph_osd_request_target {
struct ceph_osd_request {
u64 r_tid; /* unique for this client */
struct rb_node r_node;
struct list_head r_linger_item;
struct list_head r_linger_osd_item;
struct ceph_osd *r_osd;
struct ceph_osd_request_target r_t;
@@ -162,7 +161,6 @@ struct ceph_osd_request {
int r_result;
bool r_got_reply;
int r_linger;
struct ceph_osd_client *r_osdc;
struct kref r_kref;
@@ -181,6 +179,7 @@ struct ceph_osd_request {
struct ceph_snap_context *r_snapc; /* for writes */
struct timespec r_mtime; /* ditto */
u64 r_data_offset; /* ditto */
bool r_linger; /* don't resend on failure */
/* internal */
unsigned long r_stamp; /* jiffies, send or check time */
@@ -195,23 +194,40 @@ struct ceph_request_redirect {
struct ceph_object_locator oloc;
};
struct ceph_osd_event {
u64 cookie;
int one_shot;
struct ceph_osd_client *osdc;
void (*cb)(u64, u64, u8, void *);
void *data;
struct rb_node node;
struct list_head osd_node;
struct kref kref;
};
typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
u64 notifier_id, void *data, size_t data_len);
typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
struct ceph_osd_event_work {
struct work_struct work;
struct ceph_osd_event *event;
u64 ver;
u64 notify_id;
u8 opcode;
struct ceph_osd_linger_request {
struct ceph_osd_client *osdc;
u64 linger_id;
bool committed;
struct ceph_osd *osd;
struct ceph_osd_request *reg_req;
struct ceph_osd_request *ping_req;
unsigned long ping_sent;
struct ceph_osd_request_target t;
u32 last_force_resend;
struct timespec mtime;
struct kref kref;
struct mutex lock;
struct rb_node node; /* osd */
struct rb_node osdc_node; /* osdc */
struct list_head scan_item;
struct completion reg_commit_wait;
int reg_commit_error;
int last_error;
u32 register_gen;
rados_watchcb2_t wcb;
rados_watcherrcb_t errcb;
void *data;
};
struct ceph_osd_client {
@@ -223,9 +239,10 @@ struct ceph_osd_client {
struct rb_root osds; /* osds */
struct list_head osd_lru; /* idle osds */
spinlock_t osd_lru_lock;
struct list_head req_linger; /* lingering requests */
struct ceph_osd homeless_osd;
atomic64_t last_tid; /* tid of last request */
u64 last_linger_id;
struct rb_root linger_requests; /* lingering requests */
atomic_t num_requests;
atomic_t num_homeless;
struct delayed_work timeout_work;
@@ -239,10 +256,6 @@ struct ceph_osd_client {
struct ceph_msgpool msgpool_op;
struct ceph_msgpool msgpool_op_reply;
spinlock_t event_lock;
struct rb_root event_tree;
u64 event_count;
struct workqueue_struct *notify_wq;
};
@@ -314,9 +327,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
u16 opcode, const char *name, const void *value,
size_t size, u8 cmp_op, u8 cmp_mode);
extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
unsigned int which, u16 opcode,
u64 cookie, u64 version, int flag);
extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
unsigned int which,
u64 expected_object_size,
@@ -339,9 +349,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
u32 truncate_seq, u64 truncate_size,
bool use_mempool);
extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
struct ceph_osd_request *req);
extern void ceph_osdc_get_request(struct ceph_osd_request *req);
extern void ceph_osdc_put_request(struct ceph_osd_request *req);
@@ -372,11 +379,23 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
struct timespec *mtime,
struct page **pages, int nr_pages);
/* watch/notify events */
extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
void (*event_cb)(u64, u64, u8, void *),
void *data, struct ceph_osd_event **pevent);
extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
extern void ceph_osdc_put_event(struct ceph_osd_event *event);
/* watch/notify */
struct ceph_osd_linger_request *
ceph_osdc_watch(struct ceph_osd_client *osdc,
struct ceph_object_id *oid,
struct ceph_object_locator *oloc,
rados_watchcb2_t wcb,
rados_watcherrcb_t errcb,
void *data);
int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
struct ceph_osd_linger_request *lreq);
int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
struct ceph_object_id *oid,
struct ceph_object_locator *oloc,
u64 notify_id,
u64 cookie,
void *payload,
size_t payload_len);
#endif

View File

@@ -427,7 +427,17 @@ enum {
CEPH_OSD_CMPXATTR_MODE_U64 = 2
};
#define RADOS_NOTIFY_VER 1
enum {
CEPH_OSD_WATCH_OP_UNWATCH = 0,
CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
/* note: use only ODD ids to prevent pre-giant code from
interpreting the op as UNWATCH */
CEPH_OSD_WATCH_OP_WATCH = 3,
CEPH_OSD_WATCH_OP_RECONNECT = 5,
CEPH_OSD_WATCH_OP_PING = 7,
};
const char *ceph_osd_watch_op_name(int o);
/*
* an individual object operation. each may be accompanied by some data
@@ -462,8 +472,9 @@ struct ceph_osd_op {
} __attribute__ ((packed)) snap;
struct {
__le64 cookie;
__le64 ver;
__u8 flag; /* 0 = unwatch, 1 = watch */
__le64 ver; /* no longer used */
__u8 op; /* CEPH_OSD_WATCH_OP_* */
__le32 gen; /* registration generation */
} __attribute__ ((packed)) watch;
struct {
__le64 offset, length;