Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph update from Sage Weil:
 "There are a few fixes for snapshot behavior with CephFS and support
  for the new keepalive protocol from Zheng, a libceph fix that affects
  both RBD and CephFS, a few bug fixes and cleanups for RBD from Ilya,
  and several small fixes and cleanups from Jianpeng and others"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  ceph: improve readahead for file holes
  ceph: get inode size for each append write
  libceph: check data_len in ->alloc_msg()
  libceph: use keepalive2 to verify the mon session is alive
  rbd: plug rbd_dev->header.object_prefix memory leak
  rbd: fix double free on rbd_dev->header_name
  libceph: set 'exists' flag for newly up osd
  ceph: cleanup use of ceph_msg_get
  ceph: no need to get parent inode in ceph_open
  ceph: remove the useless judgement
  ceph: remove redundant test of head->safe and silence static analysis warnings
  ceph: fix queuing inode to mdsdir's snaprealm
  libceph: rename con_work() to ceph_con_workfn()
  libceph: Avoid holding the zero page on ceph_msgr_slab_init errors
  libceph: remove the unused macro AES_KEY_SIZE
  ceph: invalidate dirty pages after forced umount
  ceph: EIO all operations after forced umount
This commit is contained in:
Linus Torvalds
2015-09-11 12:33:03 -07:00
17 changed files with 191 additions and 98 deletions

View File

@@ -357,6 +357,7 @@ ceph_parse_options(char *options, const char *dev_name,
opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT;
/* get mon ip(s) */
/* ip1[:port1][,ip2[:port2]...] */

View File

@@ -79,10 +79,6 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
return 0;
}
#define AES_KEY_SIZE 16
static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
{
return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);

View File

@@ -163,6 +163,7 @@ static struct kmem_cache *ceph_msg_data_cache;
static char tag_msg = CEPH_MSGR_TAG_MSG;
static char tag_ack = CEPH_MSGR_TAG_ACK;
static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
#ifdef CONFIG_LOCKDEP
static struct lock_class_key socket_class;
@@ -176,7 +177,7 @@ static struct lock_class_key socket_class;
static void queue_con(struct ceph_connection *con);
static void cancel_con(struct ceph_connection *con);
static void con_work(struct work_struct *);
static void ceph_con_workfn(struct work_struct *);
static void con_fault(struct ceph_connection *con);
/*
@@ -276,22 +277,22 @@ static void _ceph_msgr_exit(void)
ceph_msgr_wq = NULL;
}
ceph_msgr_slab_exit();
BUG_ON(zero_page == NULL);
page_cache_release(zero_page);
zero_page = NULL;
ceph_msgr_slab_exit();
}
int ceph_msgr_init(void)
{
if (ceph_msgr_slab_init())
return -ENOMEM;
BUG_ON(zero_page != NULL);
zero_page = ZERO_PAGE(0);
page_cache_get(zero_page);
if (ceph_msgr_slab_init())
return -ENOMEM;
/*
* The number of active work items is limited by the number of
* connections, so leave @max_active at default.
@@ -749,7 +750,7 @@ void ceph_con_init(struct ceph_connection *con, void *private,
mutex_init(&con->mutex);
INIT_LIST_HEAD(&con->out_queue);
INIT_LIST_HEAD(&con->out_sent);
INIT_DELAYED_WORK(&con->work, con_work);
INIT_DELAYED_WORK(&con->work, ceph_con_workfn);
con->state = CON_STATE_CLOSED;
}
@@ -1351,7 +1352,15 @@ static void prepare_write_keepalive(struct ceph_connection *con)
{
dout("prepare_write_keepalive %p\n", con);
con_out_kvec_reset(con);
con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
struct timespec ts = CURRENT_TIME;
struct ceph_timespec ceph_ts;
ceph_encode_timespec(&ceph_ts, &ts);
con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
con_out_kvec_add(con, sizeof(ceph_ts), &ceph_ts);
} else {
con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
}
con_flag_set(con, CON_FLAG_WRITE_PENDING);
}
@@ -1625,6 +1634,12 @@ static void prepare_read_tag(struct ceph_connection *con)
con->in_tag = CEPH_MSGR_TAG_READY;
}
static void prepare_read_keepalive_ack(struct ceph_connection *con)
{
dout("prepare_read_keepalive_ack %p\n", con);
con->in_base_pos = 0;
}
/*
* Prepare to read a message.
*/
@@ -2322,13 +2337,6 @@ static int read_partial_message(struct ceph_connection *con)
return ret;
BUG_ON(!con->in_msg ^ skip);
if (con->in_msg && data_len > con->in_msg->data_length) {
pr_warn("%s skipping long message (%u > %zd)\n",
__func__, data_len, con->in_msg->data_length);
ceph_msg_put(con->in_msg);
con->in_msg = NULL;
skip = 1;
}
if (skip) {
/* skip this message */
dout("alloc_msg said skip message\n");
@@ -2457,6 +2465,17 @@ static void process_message(struct ceph_connection *con)
mutex_lock(&con->mutex);
}
static int read_keepalive_ack(struct ceph_connection *con)
{
struct ceph_timespec ceph_ts;
size_t size = sizeof(ceph_ts);
int ret = read_partial(con, size, size, &ceph_ts);
if (ret <= 0)
return ret;
ceph_decode_timespec(&con->last_keepalive_ack, &ceph_ts);
prepare_read_tag(con);
return 1;
}
/*
* Write something to the socket. Called in a worker thread when the
@@ -2526,6 +2545,10 @@ more_kvec:
do_next:
if (con->state == CON_STATE_OPEN) {
if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
prepare_write_keepalive(con);
goto more;
}
/* is anything else pending? */
if (!list_empty(&con->out_queue)) {
prepare_write_message(con);
@@ -2535,10 +2558,6 @@ do_next:
prepare_write_ack(con);
goto more;
}
if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
prepare_write_keepalive(con);
goto more;
}
}
/* Nothing to do! */
@@ -2641,6 +2660,9 @@ more:
case CEPH_MSGR_TAG_ACK:
prepare_read_ack(con);
break;
case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
prepare_read_keepalive_ack(con);
break;
case CEPH_MSGR_TAG_CLOSE:
con_close_socket(con);
con->state = CON_STATE_CLOSED;
@@ -2684,6 +2706,12 @@ more:
process_ack(con);
goto more;
}
if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
ret = read_keepalive_ack(con);
if (ret <= 0)
goto out;
goto more;
}
out:
dout("try_read done on %p ret %d\n", con, ret);
@@ -2799,7 +2827,7 @@ static void con_fault_finish(struct ceph_connection *con)
/*
* Do some work on a connection. Drop a connection ref when we're done.
*/
static void con_work(struct work_struct *work)
static void ceph_con_workfn(struct work_struct *work)
{
struct ceph_connection *con = container_of(work, struct ceph_connection,
work.work);
@@ -3101,6 +3129,20 @@ void ceph_con_keepalive(struct ceph_connection *con)
}
EXPORT_SYMBOL(ceph_con_keepalive);
bool ceph_con_keepalive_expired(struct ceph_connection *con,
unsigned long interval)
{
if (interval > 0 &&
(con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) {
struct timespec now = CURRENT_TIME;
struct timespec ts;
jiffies_to_timespec(interval, &ts);
ts = timespec_add(con->last_keepalive_ack, ts);
return timespec_compare(&now, &ts) >= 0;
}
return false;
}
static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type)
{
struct ceph_msg_data *data;

View File

@@ -149,6 +149,10 @@ static int __open_session(struct ceph_mon_client *monc)
CEPH_ENTITY_TYPE_MON, monc->cur_mon,
&monc->monmap->mon_inst[monc->cur_mon].addr);
/* send an initial keepalive to ensure our timestamp is
* valid by the time we are in an OPENED state */
ceph_con_keepalive(&monc->con);
/* initiatiate authentication handshake */
ret = ceph_auth_build_hello(monc->auth,
monc->m_auth->front.iov_base,
@@ -170,14 +174,19 @@ static bool __sub_expired(struct ceph_mon_client *monc)
*/
static void __schedule_delayed(struct ceph_mon_client *monc)
{
unsigned int delay;
struct ceph_options *opt = monc->client->options;
unsigned long delay;
if (monc->cur_mon < 0 || __sub_expired(monc))
if (monc->cur_mon < 0 || __sub_expired(monc)) {
delay = 10 * HZ;
else
} else {
delay = 20 * HZ;
dout("__schedule_delayed after %u\n", delay);
schedule_delayed_work(&monc->delayed_work, delay);
if (opt->monc_ping_timeout > 0)
delay = min(delay, opt->monc_ping_timeout / 3);
}
dout("__schedule_delayed after %lu\n", delay);
schedule_delayed_work(&monc->delayed_work,
round_jiffies_relative(delay));
}
/*
@@ -743,11 +752,23 @@ static void delayed_work(struct work_struct *work)
__close_session(monc);
__open_session(monc); /* continue hunting */
} else {
ceph_con_keepalive(&monc->con);
struct ceph_options *opt = monc->client->options;
int is_auth = ceph_auth_is_authenticated(monc->auth);
if (ceph_con_keepalive_expired(&monc->con,
opt->monc_ping_timeout)) {
dout("monc keepalive timeout\n");
is_auth = 0;
__close_session(monc);
monc->hunting = true;
__open_session(monc);
}
__validate_auth(monc);
if (!monc->hunting) {
ceph_con_keepalive(&monc->con);
__validate_auth(monc);
}
if (ceph_auth_is_authenticated(monc->auth))
if (is_auth)
__send_subscribe(monc);
}
__schedule_delayed(monc);

View File

@@ -2817,8 +2817,9 @@ out:
}
/*
* lookup and return message for incoming reply. set up reply message
* pages.
* Lookup and return message for incoming reply. Don't try to do
* anything about a larger than preallocated data portion of the
* message at the moment - for now, just skip the message.
*/
static struct ceph_msg *get_reply(struct ceph_connection *con,
struct ceph_msg_header *hdr,
@@ -2836,10 +2837,10 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
mutex_lock(&osdc->request_mutex);
req = __lookup_request(osdc, tid);
if (!req) {
*skip = 1;
pr_warn("%s osd%d tid %llu unknown, skipping\n",
__func__, osd->o_osd, tid);
m = NULL;
dout("get_reply unknown tid %llu from osd%d\n", tid,
osd->o_osd);
*skip = 1;
goto out;
}
@@ -2849,10 +2850,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
ceph_msg_revoke_incoming(req->r_reply);
if (front_len > req->r_reply->front_alloc_len) {
pr_warn("get_reply front %d > preallocated %d (%u#%llu)\n",
front_len, req->r_reply->front_alloc_len,
(unsigned int)con->peer_name.type,
le64_to_cpu(con->peer_name.num));
pr_warn("%s osd%d tid %llu front %d > preallocated %d\n",
__func__, osd->o_osd, req->r_tid, front_len,
req->r_reply->front_alloc_len);
m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
false);
if (!m)
@@ -2860,37 +2860,22 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
ceph_msg_put(req->r_reply);
req->r_reply = m;
}
m = ceph_msg_get(req->r_reply);
if (data_len > 0) {
struct ceph_osd_data *osd_data;
/*
* XXX This is assuming there is only one op containing
* XXX page data. Probably OK for reads, but this
* XXX ought to be done more generally.
*/
osd_data = osd_req_op_extent_osd_data(req, 0);
if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
if (osd_data->pages &&
unlikely(osd_data->length < data_len)) {
pr_warn("tid %lld reply has %d bytes we had only %llu bytes ready\n",
tid, data_len, osd_data->length);
*skip = 1;
ceph_msg_put(m);
m = NULL;
goto out;
}
}
if (data_len > req->r_reply->data_length) {
pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
__func__, osd->o_osd, req->r_tid, data_len,
req->r_reply->data_length);
m = NULL;
*skip = 1;
goto out;
}
*skip = 0;
m = ceph_msg_get(req->r_reply);
dout("get_reply tid %lld %p\n", tid, m);
out:
mutex_unlock(&osdc->request_mutex);
return m;
}
static struct ceph_msg *alloc_msg(struct ceph_connection *con,

View File

@@ -1300,7 +1300,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
ceph_decode_addr(&addr);
pr_info("osd%d up\n", osd);
BUG_ON(osd >= map->max_osd);
map->osd_state[osd] |= CEPH_OSD_UP;
map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS;
map->osd_addr[osd] = addr;
}