Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph updates from Sage Weil:
 "There are several patches from Ilya fixing RBD allocation lifecycle
  issues, a series adding a nocephx_sign_messages option (and associated
  bug fixes/cleanups), several patches from Zheng improving the
  (directory) fsync behavior, a big improvement from Caifeng in IO for
  direct-io requests when striping is enabled, and several other small
  fixes and cleanups"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  libceph: clear msg->con in ceph_msg_release() only
  libceph: add nocephx_sign_messages option
  libceph: stop duplicating client fields in messenger
  libceph: drop authorizer check from cephx msg signing routines
  libceph: msg signing callouts don't need con argument
  libceph: evaluate osd_req_op_data() arguments only once
  ceph: make fsync() wait unsafe requests that created/modified inode
  ceph: add request to i_unsafe_dirops when getting unsafe reply
  libceph: introduce ceph_x_authorizer_cleanup()
  ceph: don't invalidate page cache when inode is no longer used
  rbd: remove duplicate calls to rbd_dev_mapping_clear()
  rbd: set device_type::release instead of device::release
  rbd: don't free rbd_dev outside of the release callback
  rbd: return -ENOMEM instead of pool id if rbd_dev_create() fails
  libceph: use local variable cursor instead of &msg->cursor
  libceph: remove con argument in handle_reply()
  ceph: combine as many iovec as possible into one OSD request
  ceph: fix message length computation
  ceph: fix a comment typo
  rbd: drop null test before destroy functions
Linus Torvalds
2015-11-13 09:24:40 -08:00
15 changed files with 315 additions and 223 deletions
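
Before the diffs, a minimal userspace illustration (not part of the patch set; the
mount point path is hypothetical) of the case the fsync entries above target: a
freshly created file whose create/modify requests may still be "unsafe"
(unjournaled) on the MDS. After this series, fsync() on the file itself waits for
those requests, not only for dirty data and directory operations, and reports -EIO
if the wait times out.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	/* hypothetical CephFS mount point */
	int fd = open("/mnt/cephfs/newfile", O_CREAT | O_WRONLY | O_TRUNC, 0644);
	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}
	if (write(fd, "data\n", 5) != 5)
		perror("write");

	/* returns only once the MDS request that created/modified this
	 * inode has been committed (or -EIO if the wait times out) */
	if (fsync(fd) < 0)
		perror("fsync");

	close(fd);
	return 0;
}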


@@ -88,7 +88,7 @@ static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
const struct ceph_inode_info* ci = cookie_netfs_data;
uint16_t klen;
- /* use ceph virtual inode (id + snaphot) */
+ /* use ceph virtual inode (id + snapshot) */
klen = sizeof(ci->i_vino);
if (klen > maxbuf)
return 0;


@@ -1655,9 +1655,8 @@ retry_locked:
!S_ISDIR(inode->i_mode) && /* ignore readdir cache */
ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */
- (file_wanted == 0 || /* no open files */
-  (revoking & (CEPH_CAP_FILE_CACHE|
-               CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
+ (revoking & (CEPH_CAP_FILE_CACHE|
+              CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
!tried_invalidate) {
dout("check_caps trying to invalidate on %p\n", inode);
if (try_nonblocking_invalidate(inode) < 0) {
@@ -1971,49 +1970,46 @@ out:
}
/*
- * wait for any uncommitted directory operations to commit.
+ * wait for any unsafe requests to complete.
*/
- static int unsafe_dirop_wait(struct inode *inode)
+ static int unsafe_request_wait(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct list_head *head = &ci->i_unsafe_dirops;
- struct ceph_mds_request *req;
- u64 last_tid;
- int ret = 0;
- if (!S_ISDIR(inode->i_mode))
- return 0;
+ struct ceph_mds_request *req1 = NULL, *req2 = NULL;
+ int ret, err = 0;
spin_lock(&ci->i_unsafe_lock);
- if (list_empty(head))
- goto out;
- req = list_last_entry(head, struct ceph_mds_request,
- r_unsafe_dir_item);
- last_tid = req->r_tid;
- do {
- ceph_mdsc_get_request(req);
- spin_unlock(&ci->i_unsafe_lock);
- dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n",
- inode, req->r_tid, last_tid);
- ret = !wait_for_completion_timeout(&req->r_safe_completion,
- ceph_timeout_jiffies(req->r_timeout));
- if (ret)
- ret = -EIO; /* timed out */
- ceph_mdsc_put_request(req);
- spin_lock(&ci->i_unsafe_lock);
- if (ret || list_empty(head))
- break;
- req = list_first_entry(head, struct ceph_mds_request,
- r_unsafe_dir_item);
- } while (req->r_tid < last_tid);
- out:
+ if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
+ req1 = list_last_entry(&ci->i_unsafe_dirops,
+ struct ceph_mds_request,
+ r_unsafe_dir_item);
+ ceph_mdsc_get_request(req1);
+ }
+ if (!list_empty(&ci->i_unsafe_iops)) {
+ req2 = list_last_entry(&ci->i_unsafe_iops,
+ struct ceph_mds_request,
+ r_unsafe_target_item);
+ ceph_mdsc_get_request(req2);
+ }
spin_unlock(&ci->i_unsafe_lock);
- return ret;
+ dout("unsafe_requeset_wait %p wait on tid %llu %llu\n",
+ inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
+ if (req1) {
+ ret = !wait_for_completion_timeout(&req1->r_safe_completion,
+ ceph_timeout_jiffies(req1->r_timeout));
+ if (ret)
+ err = -EIO;
+ ceph_mdsc_put_request(req1);
+ }
+ if (req2) {
+ ret = !wait_for_completion_timeout(&req2->r_safe_completion,
+ ceph_timeout_jiffies(req2->r_timeout));
+ if (ret)
+ err = -EIO;
+ ceph_mdsc_put_request(req2);
+ }
+ return err;
}
int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
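
For readers unfamiliar with the idiom above: wait_for_completion_timeout() returns
0 when the timeout expires and the number of remaining jiffies otherwise, and
ceph_timeout_jiffies() maps a zero timeout to MAX_SCHEDULE_TIMEOUT (wait
indefinitely). A stripped-down sketch of the wait-and-translate-to-EIO pattern,
with a hypothetical helper name, might look like this:

#include <linux/completion.h>
#include <linux/sched.h>

/* sketch only: wait for a "request is safe" completion, optionally bounded,
 * and report a timed-out wait as -EIO (hypothetical helper, not in the patch) */
static int wait_req_safe(struct completion *safe_completion,
			 unsigned long timeout_jiffies)
{
	unsigned long left;

	if (!timeout_jiffies)
		timeout_jiffies = MAX_SCHEDULE_TIMEOUT;

	/* returns 0 on timeout, remaining jiffies on completion */
	left = wait_for_completion_timeout(safe_completion, timeout_jiffies);
	return left ? 0 : -EIO;
}
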
@@ -2039,7 +2035,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
- ret = unsafe_dirop_wait(inode);
+ ret = unsafe_request_wait(inode);
/*
* only wait on non-file metadata writeback (the mds


@@ -34,6 +34,74 @@
* need to wait for MDS acknowledgement.
*/
/*
* Calculate the length sum of direct io vectors that can
* be combined into one page vector.
*/
static size_t dio_get_pagev_size(const struct iov_iter *it)
{
const struct iovec *iov = it->iov;
const struct iovec *iovend = iov + it->nr_segs;
size_t size;
size = iov->iov_len - it->iov_offset;
/*
* An iov can be page vectored when both the current tail
* and the next base are page aligned.
*/
while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
(++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
size += iov->iov_len;
}
dout("dio_get_pagevlen len = %zu\n", size);
return size;
}
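
To make the coalescing rule concrete, here is a small userspace sketch (not kernel
code; pagev_size, PAGE_SZ and IS_PAGE_ALIGNED are local stand-ins for the function
above and for PAGE_SIZE/PAGE_ALIGNED). It applies the same rule to a three-segment
iovec: the first two segments are merged because the boundary between them is page
aligned, the third is not.

#include <stdio.h>
#include <stdint.h>
#include <sys/uio.h>

#define PAGE_SZ 4096UL
#define IS_PAGE_ALIGNED(p) (((uintptr_t)(p) & (PAGE_SZ - 1)) == 0)

static size_t pagev_size(const struct iovec *iov, int nr_segs, size_t iov_offset)
{
	const struct iovec *end = iov + nr_segs;
	size_t size = iov->iov_len - iov_offset;

	/* keep absorbing segments while the current tail and the next base
	 * are both page aligned */
	while (IS_PAGE_ALIGNED((char *)iov->iov_base + iov->iov_len) &&
	       ++iov < end && IS_PAGE_ALIGNED(iov->iov_base)) {
		size += iov->iov_len;
	}
	return size;
}

int main(void)
{
	static char buf[4 * PAGE_SZ] __attribute__((aligned(4096)));
	struct iovec vec[3] = {
		{ buf,                     2 * PAGE_SZ }, /* page aligned, page-multiple length */
		{ buf + 2 * PAGE_SZ,           PAGE_SZ }, /* still page aligned: combined      */
		{ buf + 3 * PAGE_SZ + 256,         512 }, /* misaligned base: not combined     */
	};

	/* prints 12288: the first two segments form one page vector */
	printf("%zu\n", pagev_size(vec, 3, 0));
	return 0;
}
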
/*
* Allocate a page vector based on (@it, @nbytes).
* The return value is the tuple describing a page vector,
* that is (@pages, @page_align, @num_pages).
*/
static struct page **
dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
size_t *page_align, int *num_pages)
{
struct iov_iter tmp_it = *it;
size_t align;
struct page **pages;
int ret = 0, idx, npages;
align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
(PAGE_SIZE - 1);
npages = calc_pages_for(align, nbytes);
pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
if (!pages) {
pages = vmalloc(sizeof(*pages) * npages);
if (!pages)
return ERR_PTR(-ENOMEM);
}
for (idx = 0; idx < npages; ) {
size_t start;
ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
npages - idx, &start);
if (ret < 0)
goto fail;
iov_iter_advance(&tmp_it, ret);
nbytes -= ret;
idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
}
BUG_ON(nbytes != 0);
*num_pages = npages;
*page_align = align;
dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
return pages;
fail:
ceph_put_page_vector(pages, idx, false);
return ERR_PTR(ret);
}
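
The idx advance above relies on simple page-count arithmetic: a run of ret bytes
whose first byte sits start bytes into a page spans ceil((start + ret) / PAGE_SIZE)
pages, which is also what libceph's calc_pages_for() evaluates to once the offset
has been reduced modulo the page size. A worked example (userspace sketch,
pages_spanned and PAGE_SZ are local stand-ins):

#include <stdio.h>

#define PAGE_SZ 4096UL

/* pages spanned by `len` bytes starting `off` bytes into a page */
static unsigned long pages_spanned(unsigned long off, unsigned long len)
{
	return (off + len + PAGE_SZ - 1) / PAGE_SZ;
}

int main(void)
{
	/* 8192 bytes pinned starting 512 bytes into the first page:
	 * bytes 512..8703 touch pages 0, 1 and 2, i.e. 3 pages */
	printf("%lu\n", pages_spanned(512, 8192)); /* prints 3 */
	return 0;
}
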
/*
* Prepare an open request. Preallocate ceph_cap to avoid an
@@ -458,11 +526,10 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
size_t start;
ssize_t n;
- n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start);
- if (n < 0)
- return n;
- num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
+ n = dio_get_pagev_size(i);
+ pages = dio_get_pages_alloc(i, n, &start, &num_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
ret = striped_read(inode, off, n,
pages, num_pages, checkeof,
@@ -592,7 +659,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
CEPH_OSD_FLAG_WRITE;
while (iov_iter_count(from) > 0) {
- u64 len = iov_iter_single_seg_count(from);
+ u64 len = dio_get_pagev_size(from);
size_t start;
ssize_t n;
@@ -611,14 +678,14 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
- n = iov_iter_get_pages_alloc(from, &pages, len, &start);
- if (unlikely(n < 0)) {
- ret = n;
+ n = len;
+ pages = dio_get_pages_alloc(from, len, &start, &num_pages);
+ if (IS_ERR(pages)) {
ceph_osdc_put_request(req);
+ ret = PTR_ERR(pages);
break;
}
- num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
/*
* throw out any page cache pages in this range. this
* may block.


@@ -452,6 +452,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ci->i_unsafe_writes);
INIT_LIST_HEAD(&ci->i_unsafe_dirops);
INIT_LIST_HEAD(&ci->i_unsafe_iops);
spin_lock_init(&ci->i_unsafe_lock);
ci->i_snap_realm = NULL;


@@ -633,13 +633,8 @@ static void __register_request(struct ceph_mds_client *mdsc,
mdsc->oldest_tid = req->r_tid;
if (dir) {
- struct ceph_inode_info *ci = ceph_inode(dir);
ihold(dir);
- spin_lock(&ci->i_unsafe_lock);
req->r_unsafe_dir = dir;
- list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
- spin_unlock(&ci->i_unsafe_lock);
}
}
@@ -665,13 +660,20 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
rb_erase(&req->r_node, &mdsc->request_tree);
RB_CLEAR_NODE(&req->r_node);
- if (req->r_unsafe_dir) {
+ if (req->r_unsafe_dir && req->r_got_unsafe) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_dir_item);
spin_unlock(&ci->i_unsafe_lock);
+ }
+ if (req->r_target_inode && req->r_got_unsafe) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+ spin_lock(&ci->i_unsafe_lock);
+ list_del_init(&req->r_unsafe_target_item);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+ if (req->r_unsafe_dir) {
iput(req->r_unsafe_dir);
req->r_unsafe_dir = NULL;
}
@@ -1430,6 +1432,13 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
if ((used | wanted) & CEPH_CAP_ANY_WR)
goto out;
}
/* The inode has cached pages, but it's no longer used.
* we can safely drop it */
if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
!(oissued & CEPH_CAP_FILE_CACHE)) {
used = 0;
oissued = 0;
}
if ((used | wanted) & ~oissued & mine)
goto out; /* we need these caps */
@@ -1438,7 +1447,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
/* we aren't the only cap.. just remove us */
__ceph_remove_cap(cap, true);
} else {
- /* try to drop referring dentries */
+ /* try dropping referring dentries */
spin_unlock(&ci->i_ceph_lock);
d_prune_aliases(inode);
dout("trim_caps_cb %p cap %p pruned, count now %d\n",
@@ -1704,6 +1713,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
req->r_started = jiffies;
req->r_resend_mds = -1;
INIT_LIST_HEAD(&req->r_unsafe_dir_item);
INIT_LIST_HEAD(&req->r_unsafe_target_item);
req->r_fmode = -1;
kref_init(&req->r_kref);
INIT_LIST_HEAD(&req->r_wait);
@@ -1935,7 +1945,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
len = sizeof(*head) +
pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
- sizeof(struct timespec);
+ sizeof(struct ceph_timespec);
/* calculate (max) length for cap releases */
len += sizeof(struct ceph_mds_request_release) *
@@ -2477,6 +2487,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
} else {
req->r_got_unsafe = true;
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
if (req->r_unsafe_dir) {
struct ceph_inode_info *ci =
ceph_inode(req->r_unsafe_dir);
spin_lock(&ci->i_unsafe_lock);
list_add_tail(&req->r_unsafe_dir_item,
&ci->i_unsafe_dirops);
spin_unlock(&ci->i_unsafe_lock);
}
}
dout("handle_reply tid %lld result %d\n", tid, result);
@@ -2518,6 +2536,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
up_read(&mdsc->snap_rwsem);
if (realm)
ceph_put_snap_realm(mdsc, realm);
if (err == 0 && req->r_got_unsafe && req->r_target_inode) {
struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
spin_lock(&ci->i_unsafe_lock);
list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
spin_unlock(&ci->i_unsafe_lock);
}
out_err:
mutex_lock(&mdsc->mutex);
if (!req->r_aborted) {
@@ -3917,17 +3942,19 @@ static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
return msg;
}
- static int sign_message(struct ceph_connection *con, struct ceph_msg *msg)
+ static int mds_sign_message(struct ceph_msg *msg)
{
- struct ceph_mds_session *s = con->private;
+ struct ceph_mds_session *s = msg->con->private;
struct ceph_auth_handshake *auth = &s->s_auth;
return ceph_auth_sign_message(auth, msg);
}
- static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg)
+ static int mds_check_message_signature(struct ceph_msg *msg)
{
- struct ceph_mds_session *s = con->private;
+ struct ceph_mds_session *s = msg->con->private;
struct ceph_auth_handshake *auth = &s->s_auth;
return ceph_auth_check_message_signature(auth, msg);
}
@@ -3940,8 +3967,8 @@ static const struct ceph_connection_operations mds_con_ops = {
.invalidate_authorizer = invalidate_authorizer,
.peer_reset = peer_reset,
.alloc_msg = mds_alloc_msg,
- .sign_message = sign_message,
- .check_message_signature = check_message_signature,
+ .sign_message = mds_sign_message,
+ .check_message_signature = mds_check_message_signature,
};
/* eof */


@@ -236,6 +236,9 @@ struct ceph_mds_request {
struct inode *r_unsafe_dir;
struct list_head r_unsafe_dir_item;
/* unsafe requests that modify the target inode */
struct list_head r_unsafe_target_item;
struct ceph_mds_session *r_session;
int r_attempts; /* resend attempts */


@@ -342,6 +342,7 @@ struct ceph_inode_info {
struct list_head i_unsafe_writes; /* uncommitted sync writes */
struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
spinlock_t i_unsafe_lock;
struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */