Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph updates from Sage Weil:
 "There are several patches from Ilya fixing RBD allocation lifecycle
  issues, a series adding a nocephx_sign_messages option (and
  associated bug fixes/cleanups), several patches from Zheng improving
  the (directory) fsync behavior, a big improvement in IO for direct-io
  requests when striping is enabled from Caifeng, and several other
  small fixes and cleanups"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  libceph: clear msg->con in ceph_msg_release() only
  libceph: add nocephx_sign_messages option
  libceph: stop duplicating client fields in messenger
  libceph: drop authorizer check from cephx msg signing routines
  libceph: msg signing callouts don't need con argument
  libceph: evaluate osd_req_op_data() arguments only once
  ceph: make fsync() wait unsafe requests that created/modified inode
  ceph: add request to i_unsafe_dirops when getting unsafe reply
  libceph: introduce ceph_x_authorizer_cleanup()
  ceph: don't invalidate page cache when inode is no longer used
  rbd: remove duplicate calls to rbd_dev_mapping_clear()
  rbd: set device_type::release instead of device::release
  rbd: don't free rbd_dev outside of the release callback
  rbd: return -ENOMEM instead of pool id if rbd_dev_create() fails
  libceph: use local variable cursor instead of &msg->cursor
  libceph: remove con argument in handle_reply()
  ceph: combine as many iovec as possile into one OSD request
  ceph: fix message length computation
  ceph: fix a comment typo
  rbd: drop null test before destroy functions
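For context on the new nocephx_sign_messages option: it lets a client skip cephx message signing. Below is a minimal userspace sketch of how such a boolean client option can short-circuit a signing path; this is an analogy only, and every name in it is invented for illustration, not libceph's actual implementation:

	#include <stdio.h>
	#include <string.h>

	#define OPT_NOMSGSIGN (1u << 0)	/* hypothetical flag bit */

	struct client_opts {
		unsigned int flags;
	};

	/* Parse a comma-separated option string, e.g. "ro,nocephx_sign_messages". */
	static void parse_opts(struct client_opts *o, const char *s)
	{
		if (strstr(s, "nocephx_sign_messages"))
			o->flags |= OPT_NOMSGSIGN;
	}

	static int sign_message(const struct client_opts *o, const char *msg)
	{
		if (o->flags & OPT_NOMSGSIGN)
			return 0;		/* signing disabled: send unsigned */
		printf("signing %s\n", msg);	/* a real client would HMAC here */
		return 0;
	}

	int main(void)
	{
		struct client_opts o = { 0 };

		parse_opts(&o, "nocephx_sign_messages");
		return sign_message(&o, "ping");
	}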
@@ -88,7 +88,7 @@ static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
 	const struct ceph_inode_info* ci = cookie_netfs_data;
 	uint16_t klen;
 
-	/* use ceph virtual inode (id + snaphot) */
+	/* use ceph virtual inode (id + snapshot) */
 	klen = sizeof(ci->i_vino);
 	if (klen > maxbuf)
 		return 0;
@@ -1655,9 +1655,8 @@ retry_locked:
 	    !S_ISDIR(inode->i_mode) &&		/* ignore readdir cache */
 	    ci->i_wrbuffer_ref == 0 &&		/* no dirty pages... */
 	    inode->i_data.nrpages &&		/* have cached pages */
-	    (file_wanted == 0 ||		/* no open files */
-	     (revoking & (CEPH_CAP_FILE_CACHE|
-			  CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
+	    (revoking & (CEPH_CAP_FILE_CACHE|
+			 CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
 	    !tried_invalidate) {
 		dout("check_caps trying to invalidate on %p\n", inode);
 		if (try_nonblocking_invalidate(inode) < 0) {
@@ -1971,49 +1970,46 @@ out:
 }
 
 /*
- * wait for any uncommitted directory operations to commit.
+ * wait for any unsafe requests to complete.
  */
-static int unsafe_dirop_wait(struct inode *inode)
+static int unsafe_request_wait(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct list_head *head = &ci->i_unsafe_dirops;
-	struct ceph_mds_request *req;
-	u64 last_tid;
-	int ret = 0;
-
-	if (!S_ISDIR(inode->i_mode))
-		return 0;
+	struct ceph_mds_request *req1 = NULL, *req2 = NULL;
+	int ret, err = 0;
 
 	spin_lock(&ci->i_unsafe_lock);
-	if (list_empty(head))
-		goto out;
-
-	req = list_last_entry(head, struct ceph_mds_request,
-			      r_unsafe_dir_item);
-	last_tid = req->r_tid;
-
-	do {
-		ceph_mdsc_get_request(req);
-		spin_unlock(&ci->i_unsafe_lock);
-
-		dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n",
-		     inode, req->r_tid, last_tid);
-		ret = !wait_for_completion_timeout(&req->r_safe_completion,
-					ceph_timeout_jiffies(req->r_timeout));
-		if (ret)
-			ret = -EIO;  /* timed out */
-
-		ceph_mdsc_put_request(req);
-
-		spin_lock(&ci->i_unsafe_lock);
-		if (ret || list_empty(head))
-			break;
-		req = list_first_entry(head, struct ceph_mds_request,
-				       r_unsafe_dir_item);
-	} while (req->r_tid < last_tid);
-out:
+	if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
+		req1 = list_last_entry(&ci->i_unsafe_dirops,
+				       struct ceph_mds_request,
+				       r_unsafe_dir_item);
+		ceph_mdsc_get_request(req1);
+	}
+	if (!list_empty(&ci->i_unsafe_iops)) {
+		req2 = list_last_entry(&ci->i_unsafe_iops,
+				       struct ceph_mds_request,
+				       r_unsafe_target_item);
+		ceph_mdsc_get_request(req2);
+	}
 	spin_unlock(&ci->i_unsafe_lock);
-	return ret;
+
+	dout("unsafe_requeset_wait %p wait on tid %llu %llu\n",
+	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
+	if (req1) {
+		ret = !wait_for_completion_timeout(&req1->r_safe_completion,
+					ceph_timeout_jiffies(req1->r_timeout));
+		if (ret)
+			err = -EIO;
+		ceph_mdsc_put_request(req1);
+	}
+	if (req2) {
+		ret = !wait_for_completion_timeout(&req2->r_safe_completion,
+					ceph_timeout_jiffies(req2->r_timeout));
+		if (ret)
+			err = -EIO;
+		ceph_mdsc_put_request(req2);
+	}
+	return err;
 }
 
 int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
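The rewrite above no longer loops over every pending entry: it grabs only the newest unsafe request from each list (directory ops and inode ops) under i_unsafe_lock, takes a reference, and waits outside the lock, on the apparent assumption that the tid-ordered tail covers the earlier entries. A rough userspace analogy of that take-newest-then-wait pattern, using a pthread condition variable in place of the kernel's completion (all names invented):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <unistd.h>

	struct completion {
		pthread_mutex_t lock;
		pthread_cond_t cond;
		bool done;
	};

	/* Analogue of complete(): mark done and wake any waiter. */
	static void complete(struct completion *c)
	{
		pthread_mutex_lock(&c->lock);
		c->done = true;
		pthread_cond_broadcast(&c->cond);
		pthread_mutex_unlock(&c->lock);
	}

	/* Analogue of wait_for_completion(): block until done is set. */
	static void wait_for_completion(struct completion *c)
	{
		pthread_mutex_lock(&c->lock);
		while (!c->done)
			pthread_cond_wait(&c->cond, &c->lock);
		pthread_mutex_unlock(&c->lock);
	}

	static void *commit_thread(void *arg)
	{
		usleep(1000);		/* simulate the MDS committing the request */
		complete(arg);
		return NULL;
	}

	int main(void)
	{
		struct completion newest = {
			PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, false
		};
		pthread_t t;

		/* fsync-style wait: block only on the newest unsafe request. */
		pthread_create(&t, NULL, commit_thread, &newest);
		wait_for_completion(&newest);
		pthread_join(&t, NULL);
		puts("newest request is safe");
		return 0;
	}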
@@ -2039,7 +2035,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	dirty = try_flush_caps(inode, &flush_tid);
 	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
 
-	ret = unsafe_dirop_wait(inode);
+	ret = unsafe_request_wait(inode);
 
 	/*
 	 * only wait on non-file metadata writeback (the mds
@@ -34,6 +34,74 @@
  * need to wait for MDS acknowledgement.
  */
 
+/*
+ * Calculate the length sum of direct io vectors that can
+ * be combined into one page vector.
+ */
+static size_t dio_get_pagev_size(const struct iov_iter *it)
+{
+	const struct iovec *iov = it->iov;
+	const struct iovec *iovend = iov + it->nr_segs;
+	size_t size;
+
+	size = iov->iov_len - it->iov_offset;
+	/*
+	 * An iov can be page vectored when both the current tail
+	 * and the next base are page aligned.
+	 */
+	while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
+	       (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
+		size += iov->iov_len;
+	}
+	dout("dio_get_pagevlen len = %zu\n", size);
+	return size;
+}
+
+/*
+ * Allocate a page vector based on (@it, @nbytes).
+ * The return value is the tuple describing a page vector,
+ * that is (@pages, @page_align, @num_pages).
+ */
+static struct page **
+dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
+		    size_t *page_align, int *num_pages)
+{
+	struct iov_iter tmp_it = *it;
+	size_t align;
+	struct page **pages;
+	int ret = 0, idx, npages;
+
+	align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
+		(PAGE_SIZE - 1);
+	npages = calc_pages_for(align, nbytes);
+	pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
+	if (!pages) {
+		pages = vmalloc(sizeof(*pages) * npages);
+		if (!pages)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	for (idx = 0; idx < npages; ) {
+		size_t start;
+		ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
+					 npages - idx, &start);
+		if (ret < 0)
+			goto fail;
+
+		iov_iter_advance(&tmp_it, ret);
+		nbytes -= ret;
+		idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
+	}
+
+	BUG_ON(nbytes != 0);
+	*num_pages = npages;
+	*page_align = align;
+	dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
+	return pages;
+fail:
+	ceph_put_page_vector(pages, idx, false);
+	return ERR_PTR(ret);
+}
+
 /*
  * Prepare an open request.  Preallocate ceph_cap to avoid an
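The helpers added above implement the iovec-combining rule behind the "combine as many iovec as possible into one OSD request" change: consecutive direct-io segments can share one page vector only while each segment ends on a page boundary and the next one begins on one. A small standalone demonstration of that alignment test (userspace C, simplified to ignore the iterator offset; names are illustrative):

	#include <stdio.h>
	#include <stdint.h>
	#include <sys/uio.h>

	#define PAGE_SIZE 4096UL
	#define IS_PAGE_ALIGNED(p) (((uintptr_t)(p) & (PAGE_SIZE - 1)) == 0)

	/*
	 * Sum the lengths of leading iovecs that can share one page vector:
	 * stop as soon as a segment does not end, or the next does not
	 * start, on a page boundary (the same test as the kernel loop).
	 */
	static size_t combinable_len(const struct iovec *iov, int nr_segs)
	{
		const struct iovec *end = iov + nr_segs;
		size_t size = iov->iov_len;

		while (IS_PAGE_ALIGNED((char *)iov->iov_base + iov->iov_len) &&
		       ++iov < end && IS_PAGE_ALIGNED(iov->iov_base))
			size += iov->iov_len;
		return size;
	}

	int main(void)
	{
		static char a[PAGE_SIZE] __attribute__((aligned(4096)));
		static char b[PAGE_SIZE] __attribute__((aligned(4096)));
		struct iovec v[2] = {
			{ .iov_base = a, .iov_len = sizeof(a) },
			{ .iov_base = b, .iov_len = sizeof(b) },
		};

		/* Both segments are page aligned, so both combine: prints 8192. */
		printf("%zu\n", combinable_len(v, 2));
		return 0;
	}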
@@ -458,11 +526,10 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
 		size_t start;
 		ssize_t n;
 
-		n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start);
-		if (n < 0)
-			return n;
-
-		num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
+		n = dio_get_pagev_size(i);
+		pages = dio_get_pages_alloc(i, n, &start, &num_pages);
+		if (IS_ERR(pages))
+			return PTR_ERR(pages);
 
 		ret = striped_read(inode, off, n,
 				   pages, num_pages, checkeof,
@@ -592,7 +659,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 		CEPH_OSD_FLAG_WRITE;
 
 	while (iov_iter_count(from) > 0) {
-		u64 len = iov_iter_single_seg_count(from);
+		u64 len = dio_get_pagev_size(from);
 		size_t start;
 		ssize_t n;
 
@@ -611,14 +678,14 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 
 		osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
 
-		n = iov_iter_get_pages_alloc(from, &pages, len, &start);
-		if (unlikely(n < 0)) {
-			ret = n;
+		n = len;
+		pages = dio_get_pages_alloc(from, len, &start, &num_pages);
+		if (IS_ERR(pages)) {
 			ceph_osdc_put_request(req);
+			ret = PTR_ERR(pages);
 			break;
 		}
 
-		num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
 		/*
 		 * throw out any page cache pages in this range.  this
 		 * may block.
@@ -452,6 +452,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 
 	INIT_LIST_HEAD(&ci->i_unsafe_writes);
 	INIT_LIST_HEAD(&ci->i_unsafe_dirops);
+	INIT_LIST_HEAD(&ci->i_unsafe_iops);
 	spin_lock_init(&ci->i_unsafe_lock);
 
 	ci->i_snap_realm = NULL;
@@ -633,13 +633,8 @@ static void __register_request(struct ceph_mds_client *mdsc,
 		mdsc->oldest_tid = req->r_tid;
 
 	if (dir) {
-		struct ceph_inode_info *ci = ceph_inode(dir);
-
 		ihold(dir);
-		spin_lock(&ci->i_unsafe_lock);
 		req->r_unsafe_dir = dir;
-		list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
-		spin_unlock(&ci->i_unsafe_lock);
 	}
 }
 
@@ -665,13 +660,20 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 	rb_erase(&req->r_node, &mdsc->request_tree);
 	RB_CLEAR_NODE(&req->r_node);
 
-	if (req->r_unsafe_dir) {
+	if (req->r_unsafe_dir && req->r_got_unsafe) {
 		struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
-
 		spin_lock(&ci->i_unsafe_lock);
 		list_del_init(&req->r_unsafe_dir_item);
 		spin_unlock(&ci->i_unsafe_lock);
+	}
+	if (req->r_target_inode && req->r_got_unsafe) {
+		struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+		spin_lock(&ci->i_unsafe_lock);
+		list_del_init(&req->r_unsafe_target_item);
+		spin_unlock(&ci->i_unsafe_lock);
+	}
 
+	if (req->r_unsafe_dir) {
 		iput(req->r_unsafe_dir);
 		req->r_unsafe_dir = NULL;
 	}
@@ -1430,6 +1432,13 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
 		if ((used | wanted) & CEPH_CAP_ANY_WR)
 			goto out;
 	}
+	/* The inode has cached pages, but it's no longer used.
+	 * we can safely drop it */
+	if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
+	    !(oissued & CEPH_CAP_FILE_CACHE)) {
+		used = 0;
+		oissued = 0;
+	}
 	if ((used | wanted) & ~oissued & mine)
 		goto out;   /* we need these caps */
 
@@ -1438,7 +1447,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
 		/* we aren't the only cap.. just remove us */
 		__ceph_remove_cap(cap, true);
 	} else {
-		/* try to drop referring dentries */
+		/* try dropping referring dentries */
 		spin_unlock(&ci->i_ceph_lock);
 		d_prune_aliases(inode);
 		dout("trim_caps_cb %p cap %p pruned, count now %d\n",
@@ -1704,6 +1713,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
 	req->r_started = jiffies;
 	req->r_resend_mds = -1;
 	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
+	INIT_LIST_HEAD(&req->r_unsafe_target_item);
 	req->r_fmode = -1;
 	kref_init(&req->r_kref);
 	INIT_LIST_HEAD(&req->r_wait);
@@ -1935,7 +1945,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 
 	len = sizeof(*head) +
 		pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
-		sizeof(struct timespec);
+		sizeof(struct ceph_timespec);
 
 	/* calculate (max) length for cap releases */
 	len += sizeof(struct ceph_mds_request_release) *
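The length fix above matters because the wire format's timestamp uses two fixed-width 32-bit fields, while an in-memory struct timespec uses longs, so the two sizes disagree on 64-bit hosts and the reserved length did not match what the request encoder actually writes. A standalone check (the ceph_timespec definition below mirrors the on-wire layout; treat the exact figures as host-dependent):

	#include <stdio.h>
	#include <stdint.h>
	#include <time.h>

	/* Mirrors the on-wire layout: two little-endian 32-bit fields. */
	struct ceph_timespec {
		uint32_t tv_sec;
		uint32_t tv_nsec;
	} __attribute__((packed));

	int main(void)
	{
		/* On a typical 64-bit host this prints 16 vs 8: sizing the
		 * buffer with sizeof(struct timespec) does not match the
		 * 8 bytes actually encoded. */
		printf("timespec=%zu ceph_timespec=%zu\n",
		       sizeof(struct timespec), sizeof(struct ceph_timespec));
		return 0;
	}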
@@ -2477,6 +2487,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	} else {
 		req->r_got_unsafe = true;
 		list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
+		if (req->r_unsafe_dir) {
+			struct ceph_inode_info *ci =
+					ceph_inode(req->r_unsafe_dir);
+			spin_lock(&ci->i_unsafe_lock);
+			list_add_tail(&req->r_unsafe_dir_item,
+				      &ci->i_unsafe_dirops);
+			spin_unlock(&ci->i_unsafe_lock);
+		}
 	}
 
 	dout("handle_reply tid %lld result %d\n", tid, result);
@@ -2518,6 +2536,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 		up_read(&mdsc->snap_rwsem);
 	if (realm)
 		ceph_put_snap_realm(mdsc, realm);
+
+	if (err == 0 && req->r_got_unsafe && req->r_target_inode) {
+		struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+		spin_lock(&ci->i_unsafe_lock);
+		list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
+		spin_unlock(&ci->i_unsafe_lock);
+	}
 out_err:
 	mutex_lock(&mdsc->mutex);
 	if (!req->r_aborted) {
@@ -3917,17 +3942,19 @@ static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
 	return msg;
 }
 
-static int sign_message(struct ceph_connection *con, struct ceph_msg *msg)
+static int mds_sign_message(struct ceph_msg *msg)
 {
-	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_session *s = msg->con->private;
 	struct ceph_auth_handshake *auth = &s->s_auth;
+
 	return ceph_auth_sign_message(auth, msg);
 }
 
-static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg)
+static int mds_check_message_signature(struct ceph_msg *msg)
 {
-	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_session *s = msg->con->private;
 	struct ceph_auth_handshake *auth = &s->s_auth;
+
 	return ceph_auth_check_message_signature(auth, msg);
 }
 
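The refactor above works because each message already carries a back-pointer to its connection, so the signing callouts can recover the session from msg->con instead of taking a separate con parameter. A minimal sketch of the same idea (all types below are invented for illustration, not libceph's):

	#include <stdio.h>

	struct connection {
		void *private;		/* e.g. the owning MDS session */
	};

	struct message {
		struct connection *con;	/* back-pointer set when msg is queued */
		const char *payload;
	};

	/* New-style hook: everything needed hangs off the message itself. */
	static int sign_message(struct message *msg)
	{
		struct connection *con = msg->con;

		printf("signing '%s' for session %p\n",
		       msg->payload, con->private);
		return 0;
	}

	int main(void)
	{
		int session;		/* stand-in for a real session object */
		struct connection con = { .private = &session };
		struct message msg = { .con = &con, .payload = "hello" };

		return sign_message(&msg);
	}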
@@ -3940,8 +3967,8 @@ static const struct ceph_connection_operations mds_con_ops = {
 	.invalidate_authorizer = invalidate_authorizer,
 	.peer_reset = peer_reset,
 	.alloc_msg = mds_alloc_msg,
-	.sign_message = sign_message,
-	.check_message_signature = check_message_signature,
+	.sign_message = mds_sign_message,
+	.check_message_signature = mds_check_message_signature,
 };
 
 /* eof */
@@ -236,6 +236,9 @@ struct ceph_mds_request {
 	struct inode *r_unsafe_dir;
 	struct list_head r_unsafe_dir_item;
 
+	/* unsafe requests that modify the target inode */
+	struct list_head r_unsafe_target_item;
+
 	struct ceph_mds_session *r_session;
 
 	int r_attempts;   /* resend attempts */
@@ -342,6 +342,7 @@ struct ceph_inode_info {
 
 	struct list_head i_unsafe_writes; /* uncommitted sync writes */
 	struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
+	struct list_head i_unsafe_iops;   /* uncommitted mds inode ops */
 	spinlock_t i_unsafe_lock;
 
 	struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */