Merge tag 'for-4.19/block-20180812' of git://git.kernel.dk/linux-block

Pull block updates from Jens Axboe:
 "First pull request for this merge window, there will also be a
  followup request with some stragglers.

  This pull request contains:

   - Fix for a thundering herd issue in the wbt block code (Anchal
     Agarwal)

   - A few NVMe pull requests:
      * Improved tracepoints (Keith)
      * Larger inline data support for RDMA (Steve Wise)
      * RDMA setup/teardown fixes (Sagi)
      * Effects log support for NVMe target (Chaitanya Kulkarni)
      * Buffered IO support for NVMe target (Chaitanya Kulkarni)
      * TP4004 (ANA) support (Christoph)
      * Various NVMe fixes

   - Block io-latency controller support. Much needed support for
     properly containing block devices. (Josef)

   - Series improving how we handle sense information on the stack
     (Kees)

   - Lightnvm fixes and updates/improvements (Mathias/Javier et al)

   - Zoned device support for null_blk (Matias)

   - AIX partition fixes (Mauricio Faria de Oliveira)

   - DIF checksum code made generic (Max Gurtovoy)

   - Add support for discard in iostats (Michael Callahan / Tejun)

   - Set of updates for BFQ (Paolo)

   - Removal of async write support for bsg (Christoph)

   - Bio page dirtying and clone fixups (Christoph)

   - Set of bcache fix/changes (via Coly)

   - Series improving blk-mq queue setup/teardown speed (Ming)

   - Series improving merging performance on blk-mq (Ming)

   - Lots of other fixes and cleanups from a slew of folks"

* tag 'for-4.19/block-20180812' of git://git.kernel.dk/linux-block: (190 commits)
  blkcg: Make blkg_root_lookup() work for queues in bypass mode
  bcache: fix error setting writeback_rate through sysfs interface
  null_blk: add lock drop/acquire annotation
  Blk-throttle: reduce tail io latency when iops limit is enforced
  block: paride: pd: mark expected switch fall-throughs
  block: Ensure that a request queue is dissociated from the cgroup controller
  block: Introduce blk_exit_queue()
  blkcg: Introduce blkg_root_lookup()
  block: Remove two superfluous #include directives
  blk-mq: count the hctx as active before allocating tag
  block: bvec_nr_vecs() returns value for wrong slab
  bcache: trivial - remove tailing backslash in macro BTREE_FLAG
  bcache: make the pr_err statement used for ENOENT only in sysfs_attatch section
  bcache: set max writeback rate when I/O request is idle
  bcache: add code comments for bset.c
  bcache: fix mistaken comments in request.c
  bcache: fix mistaken code comments in bcache.h
  bcache: add a comment in super.c
  bcache: avoid unncessary cache prefetch bch_btree_node_get()
  bcache: display rate debug parameters to 0 when writeback is not running
  ...
Linus Torvalds, 2018-08-14 10:23:25 -07:00
172 changed files with 6035 additions and 2665 deletions


@@ -252,7 +252,8 @@ void nvme_complete_rq(struct request *req)
trace_nvme_complete_rq(req);
if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
if (nvme_req_needs_failover(req, status)) {
if ((req->cmd_flags & REQ_NVME_MPATH) &&
blk_path_error(status)) {
nvme_failover_req(req);
return;
}
@@ -617,6 +618,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
return BLK_STS_NOTSUPP;
control |= NVME_RW_PRINFO_PRACT;
} else if (req_op(req) == REQ_OP_WRITE) {
t10_pi_prepare(req, ns->pi_type);
}
switch (ns->pi_type) {
@@ -627,8 +630,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
case NVME_NS_DPS_PI_TYPE2:
control |= NVME_RW_PRINFO_PRCHK_GUARD |
NVME_RW_PRINFO_PRCHK_REF;
cmnd->rw.reftag = cpu_to_le32(
nvme_block_nr(ns, blk_rq_pos(req)));
cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
break;
}
}
@@ -638,6 +640,22 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
return 0;
}
void nvme_cleanup_cmd(struct request *req)
{
if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
nvme_req(req)->status == 0) {
struct nvme_ns *ns = req->rq_disk->private_data;
t10_pi_complete(req, ns->pi_type,
blk_rq_bytes(req) >> ns->lba_shift);
}
if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
kfree(page_address(req->special_vec.bv_page) +
req->special_vec.bv_offset);
}
}
EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmd)
{
@@ -668,10 +686,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
}
cmd->common.command_id = req->tag;
if (ns)
trace_nvme_setup_nvm_cmd(req->q->id, cmd);
else
trace_nvme_setup_admin_cmd(cmd);
trace_nvme_setup_cmd(req, cmd);
return ret;
}
EXPORT_SYMBOL_GPL(nvme_setup_cmd);
@@ -864,9 +879,6 @@ static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
if (unlikely(ctrl->kato == 0))
return;
INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
}
@@ -1056,7 +1068,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
EXPORT_SYMBOL_GPL(nvme_set_queue_count);
#define NVME_AEN_SUPPORTED \
(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT)
(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE)
static void nvme_enable_aen(struct nvme_ctrl *ctrl)
{
@@ -1472,6 +1484,12 @@ static void nvme_update_disk_info(struct gendisk *disk,
set_capacity(disk, capacity);
nvme_config_discard(ns);
if (id->nsattr & (1 << 0))
set_disk_ro(disk, true);
else
set_disk_ro(disk, false);
blk_mq_unfreeze_queue(disk->queue);
}
@@ -2270,21 +2288,16 @@ out_unlock:
return ret;
}
int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
u8 log_page, void *log,
size_t size, u64 offset)
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
void *log, size_t size, u64 offset)
{
struct nvme_command c = { };
unsigned long dwlen = size / 4 - 1;
c.get_log_page.opcode = nvme_admin_get_log_page;
if (ns)
c.get_log_page.nsid = cpu_to_le32(ns->head->ns_id);
else
c.get_log_page.nsid = cpu_to_le32(NVME_NSID_ALL);
c.get_log_page.nsid = cpu_to_le32(nsid);
c.get_log_page.lid = log_page;
c.get_log_page.lsp = lsp;
c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
@@ -2293,12 +2306,6 @@ int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
}
static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log,
size_t size)
{
return nvme_get_log_ext(ctrl, NULL, log_page, log, size, 0);
}
static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
{
int ret;
@@ -2309,8 +2316,8 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
if (!ctrl->effects)
return 0;
ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects,
sizeof(*ctrl->effects));
ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0,
ctrl->effects, sizeof(*ctrl->effects), 0);
if (ret) {
kfree(ctrl->effects);
ctrl->effects = NULL;
@@ -2401,6 +2408,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
nvme_set_queue_limits(ctrl, ctrl->admin_q);
ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas);
ctrl->max_namespaces = le32_to_cpu(id->mnan);
if (id->rtd3e) {
/* us -> s */
@@ -2460,8 +2468,12 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
}
ret = nvme_mpath_init(ctrl, id);
kfree(id);
if (ret < 0)
return ret;
if (ctrl->apst_enabled && !prev_apst_enabled)
dev_pm_qos_expose_latency_tolerance(ctrl->device);
else if (!ctrl->apst_enabled && prev_apst_enabled)
@@ -2680,6 +2692,10 @@ static struct attribute *nvme_ns_id_attrs[] = {
&dev_attr_nguid.attr,
&dev_attr_eui.attr,
&dev_attr_nsid.attr,
#ifdef CONFIG_NVME_MULTIPATH
&dev_attr_ana_grpid.attr,
&dev_attr_ana_state.attr,
#endif
NULL,
};
@@ -2702,6 +2718,14 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
return 0;
}
#ifdef CONFIG_NVME_MULTIPATH
if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */
return 0;
if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
return 0;
}
#endif
return a->mode;
}
@@ -3075,8 +3099,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
nvme_get_ctrl(ctrl);
kfree(id);
device_add_disk(ctrl->device, ns->disk);
if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
&nvme_ns_id_attr_group))
@@ -3086,8 +3108,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
ns->disk->disk_name);
nvme_mpath_add_disk(ns->head);
nvme_mpath_add_disk(ns, id);
nvme_fault_inject_init(ns);
kfree(id);
return;
out_unlink_ns:
mutex_lock(&ctrl->subsys->lock);
@@ -3229,7 +3253,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
* raced with us in reading the log page, which could cause us to miss
* updates.
*/
error = nvme_get_log(ctrl, NVME_LOG_CHANGED_NS, log, log_size);
error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log,
log_size, 0);
if (error)
dev_warn(ctrl->device,
"reading changed ns log failed: %d\n", error);
@@ -3346,9 +3371,9 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
if (!log)
return;
if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log)))
dev_warn(ctrl->device,
"Get FW SLOT INFO log error\n");
if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log,
sizeof(*log), 0))
dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
kfree(log);
}
@@ -3394,6 +3419,13 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
case NVME_AER_NOTICE_FW_ACT_STARTING:
queue_work(nvme_wq, &ctrl->fw_act_work);
break;
#ifdef CONFIG_NVME_MULTIPATH
case NVME_AER_NOTICE_ANA:
if (!ctrl->ana_log_buf)
break;
queue_work(nvme_wq, &ctrl->ana_work);
break;
#endif
default:
dev_warn(ctrl->device, "async event result %08x\n", result);
}
@@ -3426,6 +3458,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
{
nvme_mpath_stop(ctrl);
nvme_stop_keep_alive(ctrl);
flush_work(&ctrl->async_event_work);
flush_work(&ctrl->scan_work);
@@ -3463,6 +3496,7 @@ static void nvme_free_ctrl(struct device *dev)
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
kfree(ctrl->effects);
nvme_mpath_uninit(ctrl);
if (subsys) {
mutex_lock(&subsys->lock);
@@ -3499,6 +3533,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
if (ret < 0)
goto out;
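
The log-page hunks above fold the old nvme_get_log_ext()/nvme_get_log() pair into a single exported helper that takes an explicit nsid plus the new lsp (Log Specific Field) argument. A minimal sketch of a caller under that signature; NVME_LOG_SMART and struct nvme_smart_log are assumed from include/linux/nvme.h, and the snippet is illustrative rather than a hunk from this series:

	struct nvme_smart_log *log;

	log = kzalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return;
	/* lsp == 0 and offset == 0: fetch the whole SMART / health log */
	if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
			log, sizeof(*log), 0))
		dev_warn(ctrl->device, "reading SMART log failed\n");
	kfree(log);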


@@ -474,7 +474,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
bool nvmf_should_reconnect(struct nvme_ctrl *ctrl)
{
if (ctrl->opts->max_reconnects != -1 &&
if (ctrl->opts->max_reconnects == -1 ||
ctrl->nr_reconnects < ctrl->opts->max_reconnects)
return true;
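
/*
 * Note on the one-line fix above: opts->max_reconnects == -1 is the
 * "reconnect forever" sentinel (set when ctrl_loss_tmo < 0), so the old
 * '!= -1 &&' form made that case return false, i.e. never reconnect.
 */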


@@ -1737,6 +1737,7 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq,
int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
struct nvme_fc_queue *queue = &ctrl->queues[queue_idx];
nvme_req(rq)->ctrl = &ctrl->ctrl;
return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++);
}


@@ -414,12 +414,6 @@ static int nvme_nvm_setup_20(struct nvme_nvm_id20 *id,
/* Set compacted version for upper layers */
geo->version = NVM_OCSSD_SPEC_20;
if (!(geo->major_ver_id == 2 && geo->minor_ver_id == 0)) {
pr_err("nvm: OCSSD version not supported (v%d.%d)\n",
geo->major_ver_id, geo->minor_ver_id);
return -EINVAL;
}
geo->num_ch = le16_to_cpu(id->num_grp);
geo->num_lun = le16_to_cpu(id->num_pu);
geo->all_luns = geo->num_ch * geo->num_lun;
@@ -583,7 +577,13 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
struct ppa_addr ppa;
size_t left = nchks * sizeof(struct nvme_nvm_chk_meta);
size_t log_pos, offset, len;
int ret, i;
int ret, i, max_len;
/*
* limit requests to maximum 256K to avoid issuing arbitrarily large
* requests when the device does not specify a maximum transfer size.
*/
max_len = min_t(unsigned int, ctrl->max_hw_sectors << 9, 256 * 1024);
/* Normalize lba address space to obtain log offset */
ppa.ppa = slba;
@@ -596,10 +596,11 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
offset = log_pos * sizeof(struct nvme_nvm_chk_meta);
while (left) {
len = min_t(unsigned int, left, ctrl->max_hw_sectors << 9);
len = min_t(unsigned int, left, max_len);
ret = nvme_get_log_ext(ctrl, ns, NVME_NVM_LOG_REPORT_CHUNK,
dev_meta, len, offset);
ret = nvme_get_log(ctrl, ns->head->ns_id,
NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len,
offset);
if (ret) {
dev_err(ctrl->device, "Get REPORT CHUNK log error\n");
break;
@@ -662,12 +663,10 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q,
rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
if (rqd->bio) {
if (rqd->bio)
blk_init_request_from_bio(rq, rqd->bio);
} else {
else
rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
rq->__data_len = 0;
}
return rq;
}


@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017 Christoph Hellwig.
* Copyright (c) 2017-2018 Christoph Hellwig.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -20,6 +20,11 @@ module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
"turn on native support for multiple controllers per subsystem");
inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
{
return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
}
/*
* If multipathing is enabled we need to always use the subsystem instance
* number for numbering our devices to avoid conflicts between subsystems that
@@ -45,6 +50,7 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
void nvme_failover_req(struct request *req)
{
struct nvme_ns *ns = req->q->queuedata;
u16 status = nvme_req(req)->status;
unsigned long flags;
spin_lock_irqsave(&ns->head->requeue_lock, flags);
@@ -52,15 +58,35 @@ void nvme_failover_req(struct request *req)
spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
blk_mq_end_request(req, 0);
nvme_reset_ctrl(ns->ctrl);
kblockd_schedule_work(&ns->head->requeue_work);
}
switch (status & 0x7ff) {
case NVME_SC_ANA_TRANSITION:
case NVME_SC_ANA_INACCESSIBLE:
case NVME_SC_ANA_PERSISTENT_LOSS:
/*
* If we got back an ANA error we know the controller is alive,
* but not ready to serve this namespace. The spec suggests
* we should update our general state here, but due to the fact
* that the admin and I/O queues are not serialized that is
* fundamentally racy. So instead just clear the current path,
* mark the path as pending and kick off a re-read of the ANA
* log page ASAP.
*/
nvme_mpath_clear_current_path(ns);
if (ns->ctrl->ana_log_buf) {
set_bit(NVME_NS_ANA_PENDING, &ns->flags);
queue_work(nvme_wq, &ns->ctrl->ana_work);
}
break;
default:
/*
* Reset the controller for any non-ANA error as we don't know
* what caused the error.
*/
nvme_reset_ctrl(ns->ctrl);
break;
}
bool nvme_req_needs_failover(struct request *req, blk_status_t error)
{
if (!(req->cmd_flags & REQ_NVME_MPATH))
return false;
return blk_path_error(error);
kblockd_schedule_work(&ns->head->requeue_work);
}
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
@@ -75,25 +101,51 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
up_read(&ctrl->namespaces_rwsem);
}
static const char *nvme_ana_state_names[] = {
[0] = "invalid state",
[NVME_ANA_OPTIMIZED] = "optimized",
[NVME_ANA_NONOPTIMIZED] = "non-optimized",
[NVME_ANA_INACCESSIBLE] = "inaccessible",
[NVME_ANA_PERSISTENT_LOSS] = "persistent-loss",
[NVME_ANA_CHANGE] = "change",
};
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
{
struct nvme_ns *ns;
struct nvme_ns *ns, *fallback = NULL;
list_for_each_entry_rcu(ns, &head->list, siblings) {
if (ns->ctrl->state == NVME_CTRL_LIVE) {
if (ns->ctrl->state != NVME_CTRL_LIVE ||
test_bit(NVME_NS_ANA_PENDING, &ns->flags))
continue;
switch (ns->ana_state) {
case NVME_ANA_OPTIMIZED:
rcu_assign_pointer(head->current_path, ns);
return ns;
case NVME_ANA_NONOPTIMIZED:
fallback = ns;
break;
default:
break;
}
}
return NULL;
if (fallback)
rcu_assign_pointer(head->current_path, fallback);
return fallback;
}
static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
return ns->ctrl->state == NVME_CTRL_LIVE &&
ns->ana_state == NVME_ANA_OPTIMIZED;
}
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE))
if (unlikely(!ns || !nvme_path_is_optimized(ns)))
ns = __nvme_find_path(head);
return ns;
}
@@ -142,7 +194,7 @@ static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
srcu_idx = srcu_read_lock(&head->srcu);
ns = srcu_dereference(head->current_path, &head->srcu);
if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE))
if (likely(ns && nvme_path_is_optimized(ns)))
found = ns->queue->poll_fn(q, qc);
srcu_read_unlock(&head->srcu, srcu_idx);
return found;
@@ -176,6 +228,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
struct request_queue *q;
bool vwc = false;
mutex_init(&head->lock);
bio_list_init(&head->requeue_list);
spin_lock_init(&head->requeue_lock);
INIT_WORK(&head->requeue_work, nvme_requeue_work);
@@ -220,29 +273,232 @@ out:
return -ENOMEM;
}
void nvme_mpath_add_disk(struct nvme_ns_head *head)
static void nvme_mpath_set_live(struct nvme_ns *ns)
{
struct nvme_ns_head *head = ns->head;
lockdep_assert_held(&ns->head->lock);
if (!head->disk)
return;
mutex_lock(&head->subsys->lock);
if (!(head->disk->flags & GENHD_FL_UP)) {
device_add_disk(&head->subsys->dev, head->disk);
if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
&nvme_ns_id_attr_group))
pr_warn("%s: failed to create sysfs group for identification\n",
head->disk->disk_name);
dev_warn(&head->subsys->dev,
"failed to create id group.\n");
}
kblockd_schedule_work(&ns->head->requeue_work);
}
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
void *))
{
void *base = ctrl->ana_log_buf;
size_t offset = sizeof(struct nvme_ana_rsp_hdr);
int error, i;
lockdep_assert_held(&ctrl->ana_lock);
for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
struct nvme_ana_group_desc *desc = base + offset;
u32 nr_nsids = le32_to_cpu(desc->nnsids);
size_t nsid_buf_size = nr_nsids * sizeof(__le32);
if (WARN_ON_ONCE(desc->grpid == 0))
return -EINVAL;
if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
return -EINVAL;
if (WARN_ON_ONCE(desc->state == 0))
return -EINVAL;
if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
return -EINVAL;
offset += sizeof(*desc);
if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
return -EINVAL;
error = cb(ctrl, desc, data);
if (error)
return error;
offset += nsid_buf_size;
if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
return -EINVAL;
}
return 0;
}
static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}
static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
struct nvme_ns *ns)
{
enum nvme_ana_state old;
mutex_lock(&ns->head->lock);
old = ns->ana_state;
ns->ana_grpid = le32_to_cpu(desc->grpid);
ns->ana_state = desc->state;
clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
if (nvme_state_is_live(ns->ana_state) && !nvme_state_is_live(old))
nvme_mpath_set_live(ns);
mutex_unlock(&ns->head->lock);
}
static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
struct nvme_ana_group_desc *desc, void *data)
{
u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
unsigned *nr_change_groups = data;
struct nvme_ns *ns;
dev_info(ctrl->device, "ANA group %d: %s.\n",
le32_to_cpu(desc->grpid),
nvme_ana_state_names[desc->state]);
if (desc->state == NVME_ANA_CHANGE)
(*nr_change_groups)++;
if (!nr_nsids)
return 0;
down_write(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list) {
if (ns->head->ns_id != le32_to_cpu(desc->nsids[n]))
continue;
nvme_update_ns_ana_state(desc, ns);
if (++n == nr_nsids)
break;
}
up_write(&ctrl->namespaces_rwsem);
WARN_ON_ONCE(n < nr_nsids);
return 0;
}
static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only)
{
u32 nr_change_groups = 0;
int error;
mutex_lock(&ctrl->ana_lock);
error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA,
groups_only ? NVME_ANA_LOG_RGO : 0,
ctrl->ana_log_buf, ctrl->ana_log_size, 0);
if (error) {
dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
goto out_unlock;
}
error = nvme_parse_ana_log(ctrl, &nr_change_groups,
nvme_update_ana_state);
if (error)
goto out_unlock;
/*
* In theory we should have an ANATT timer per group as they might enter
* the change state at different times. But that is a lot of overhead
* just to protect against a target that keeps entering new change
* states while never finishing previous ones. But we'll still
* eventually time out once all groups are in change state, so this
* isn't a big deal.
*
* We also double the ANATT value to provide some slack for transports
* or AEN processing overhead.
*/
if (nr_change_groups)
mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
else
del_timer_sync(&ctrl->anatt_timer);
out_unlock:
mutex_unlock(&ctrl->ana_lock);
return error;
}
static void nvme_ana_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
nvme_read_ana_log(ctrl, false);
}
static void nvme_anatt_timeout(struct timer_list *t)
{
struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
nvme_reset_ctrl(ctrl);
}
void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
if (!nvme_ctrl_use_ana(ctrl))
return;
del_timer_sync(&ctrl->anatt_timer);
cancel_work_sync(&ctrl->ana_work);
}
static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);
static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);
static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl,
struct nvme_ana_group_desc *desc, void *data)
{
struct nvme_ns *ns = data;
if (ns->ana_grpid == le32_to_cpu(desc->grpid)) {
nvme_update_ns_ana_state(desc, ns);
return -ENXIO; /* just break out of the loop */
}
return 0;
}
void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
{
if (nvme_ctrl_use_ana(ns->ctrl)) {
mutex_lock(&ns->ctrl->ana_lock);
ns->ana_grpid = le32_to_cpu(id->anagrpid);
nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state);
mutex_unlock(&ns->ctrl->ana_lock);
} else {
mutex_lock(&ns->head->lock);
ns->ana_state = NVME_ANA_OPTIMIZED;
nvme_mpath_set_live(ns);
mutex_unlock(&ns->head->lock);
}
mutex_unlock(&head->subsys->lock);
}
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
if (!head->disk)
return;
sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
&nvme_ns_id_attr_group);
del_gendisk(head->disk);
if (head->disk->flags & GENHD_FL_UP) {
sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
&nvme_ns_id_attr_group);
del_gendisk(head->disk);
}
blk_set_queue_dying(head->disk->queue);
/* make sure all pending bios are cleaned up */
kblockd_schedule_work(&head->requeue_work);
@@ -250,3 +506,52 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
blk_cleanup_queue(head->disk->queue);
put_disk(head->disk);
}
int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
int error;
if (!nvme_ctrl_use_ana(ctrl))
return 0;
ctrl->anacap = id->anacap;
ctrl->anatt = id->anatt;
ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
mutex_init(&ctrl->ana_lock);
timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
if (!(ctrl->anacap & (1 << 6)))
ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);
if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
dev_err(ctrl->device,
"ANA log page size (%zd) larger than MDTS (%d).\n",
ctrl->ana_log_size,
ctrl->max_hw_sectors << SECTOR_SHIFT);
dev_err(ctrl->device, "disabling ANA support.\n");
return 0;
}
INIT_WORK(&ctrl->ana_work, nvme_ana_work);
ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
if (!ctrl->ana_log_buf)
goto out;
error = nvme_read_ana_log(ctrl, true);
if (error)
goto out_free_ana_log_buf;
return 0;
out_free_ana_log_buf:
kfree(ctrl->ana_log_buf);
out:
return -ENOMEM;
}
void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
kfree(ctrl->ana_log_buf);
}
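
For reference while reading nvme_parse_ana_log() above: the log buffer is a fixed header followed by variable-length group descriptors, each trailed by an array of namespace IDs (empty when the RGO bit requests groups only). A sketch of the layout the parser's offset arithmetic assumes; the include/linux/nvme.h side of this series is not shown here, so take the field layout as per the ANA log page format rather than as quoted code:

	struct nvme_ana_rsp_hdr {
		__le64	chgcnt;		/* change count */
		__le16	ngrps;		/* number of descriptors that follow */
		__le16	rsvd10[3];
	};

	struct nvme_ana_group_desc {
		__le32	grpid;
		__le32	nnsids;		/* entries in nsids[] below */
		__le64	chgcnt;
		__u8	state;		/* NVME_ANA_OPTIMIZED, ... */
		__u8	rsvd17[15];
		__le32	nsids[];
	};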


@@ -102,6 +102,7 @@ struct nvme_request {
u8 retries;
u8 flags;
u16 status;
struct nvme_ctrl *ctrl;
};
/*
@@ -119,6 +120,13 @@ static inline struct nvme_request *nvme_req(struct request *req)
return blk_mq_rq_to_pdu(req);
}
static inline u16 nvme_req_qid(struct request *req)
{
if (!req->rq_disk)
return 0;
return blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(req)) + 1;
}
/* The below value is the specific amount of delay needed before checking
* readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the
* NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was
@@ -175,6 +183,7 @@ struct nvme_ctrl {
u16 oacs;
u16 nssa;
u16 nr_streams;
u32 max_namespaces;
atomic_t abort_limit;
u8 vwc;
u32 vs;
@@ -197,6 +206,19 @@ struct nvme_ctrl {
struct work_struct fw_act_work;
unsigned long events;
#ifdef CONFIG_NVME_MULTIPATH
/* asymmetric namespace access: */
u8 anacap;
u8 anatt;
u32 anagrpmax;
u32 nanagrpid;
struct mutex ana_lock;
struct nvme_ana_rsp_hdr *ana_log_buf;
size_t ana_log_size;
struct timer_list anatt_timer;
struct work_struct ana_work;
#endif
/* Power saving configuration */
u64 ps_max_latency_us;
bool apst_enabled;
@@ -261,6 +283,7 @@ struct nvme_ns_head {
struct bio_list requeue_list;
spinlock_t requeue_lock;
struct work_struct requeue_work;
struct mutex lock;
#endif
struct list_head list;
struct srcu_struct srcu;
@@ -287,6 +310,10 @@ struct nvme_ns {
struct nvme_ctrl *ctrl;
struct request_queue *queue;
struct gendisk *disk;
#ifdef CONFIG_NVME_MULTIPATH
enum nvme_ana_state ana_state;
u32 ana_grpid;
#endif
struct list_head siblings;
struct nvm_dev *ndev;
struct kref kref;
@@ -299,8 +326,9 @@ struct nvme_ns {
bool ext;
u8 pi_type;
unsigned long flags;
#define NVME_NS_REMOVING 0
#define NVME_NS_DEAD 1
#define NVME_NS_REMOVING 0
#define NVME_NS_DEAD 1
#define NVME_NS_ANA_PENDING 2
u16 noiob;
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
@@ -356,14 +384,6 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
return (sector >> (ns->lba_shift - 9));
}
static inline void nvme_cleanup_cmd(struct request *req)
{
if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
kfree(page_address(req->special_vec.bv_page) +
req->special_vec.bv_offset);
}
}
static inline void nvme_end_request(struct request *req, __le16 status,
union nvme_result result)
{
@@ -420,6 +440,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl);
#define NVME_QID_ANY -1
struct request *nvme_alloc_request(struct request_queue *q,
struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid);
void nvme_cleanup_cmd(struct request *req);
blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmd);
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
@@ -435,21 +456,24 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
u8 log_page, void *log, size_t size, u64 offset);
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
void *log, size_t size, u64 offset);
extern const struct attribute_group nvme_ns_id_attr_group;
extern const struct block_device_operations nvme_ns_head_ops;
#ifdef CONFIG_NVME_MULTIPATH
bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl);
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
struct nvme_ctrl *ctrl, int *flags);
void nvme_failover_req(struct request *req);
bool nvme_req_needs_failover(struct request *req, blk_status_t error);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_disk(struct nvme_ns_head *head);
void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
void nvme_mpath_stop(struct nvme_ctrl *ctrl);
static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
@@ -468,7 +492,14 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
kblockd_schedule_work(&head->requeue_work);
}
extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
#else
static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
{
return false;
}
/*
* Without the multipath code enabled, multiple controllers per subsystem are
* visible as devices and thus we cannot use the subsystem instance.
@@ -482,11 +513,6 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
static inline void nvme_failover_req(struct request *req)
{
}
static inline bool nvme_req_needs_failover(struct request *req,
blk_status_t error)
{
return false;
}
static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
}
@@ -495,7 +521,8 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,
{
return 0;
}
static inline void nvme_mpath_add_disk(struct nvme_ns_head *head)
static inline void nvme_mpath_add_disk(struct nvme_ns *ns,
struct nvme_id_ns *id)
{
}
static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
@@ -507,6 +534,17 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
{
}
static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,
struct nvme_id_ctrl *id)
{
return 0;
}
static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
}
static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
}
#endif /* CONFIG_NVME_MULTIPATH */
#ifdef CONFIG_NVM


@@ -418,6 +418,8 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
BUG_ON(!nvmeq);
iod->nvmeq = nvmeq;
nvme_req(req)->ctrl = &dev->ctrl;
return 0;
}
@@ -535,73 +537,6 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
mempool_free(iod->sg, dev->iod_mempool);
}
#ifdef CONFIG_BLK_DEV_INTEGRITY
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
if (be32_to_cpu(pi->ref_tag) == v)
pi->ref_tag = cpu_to_be32(p);
}
static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
if (be32_to_cpu(pi->ref_tag) == p)
pi->ref_tag = cpu_to_be32(v);
}
/**
* nvme_dif_remap - remaps ref tags to bip seed and physical lba
*
* The virtual start sector is the one that was originally submitted by the
* block layer. Due to partitioning, MD/DM cloning, etc. the actual physical
* start sector may be different. Remap protection information to match the
* physical LBA on writes, and back to the original seed on reads.
*
* Type 0 and 3 do not have a ref tag, so no remapping required.
*/
static void nvme_dif_remap(struct request *req,
void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
struct nvme_ns *ns = req->rq_disk->private_data;
struct bio_integrity_payload *bip;
struct t10_pi_tuple *pi;
void *p, *pmap;
u32 i, nlb, ts, phys, virt;
if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
return;
bip = bio_integrity(req->bio);
if (!bip)
return;
pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;
p = pmap;
virt = bip_get_seed(bip);
phys = nvme_block_nr(ns, blk_rq_pos(req));
nlb = (blk_rq_bytes(req) >> ns->lba_shift);
ts = ns->disk->queue->integrity.tuple_size;
for (i = 0; i < nlb; i++, virt++, phys++) {
pi = (struct t10_pi_tuple *)p;
dif_swap(phys, virt, pi);
p += ts;
}
kunmap_atomic(pmap);
}
#else /* CONFIG_BLK_DEV_INTEGRITY */
static void nvme_dif_remap(struct request *req,
void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
}
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
#endif
static void nvme_print_sgl(struct scatterlist *sgl, int nents)
{
int i;
@@ -827,9 +762,6 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1)
goto out_unmap;
if (req_op(req) == REQ_OP_WRITE)
nvme_dif_remap(req, nvme_dif_prep);
if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir))
goto out_unmap;
}
@@ -852,11 +784,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
if (iod->nents) {
dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
if (blk_integrity_rq(req)) {
if (req_op(req) == REQ_OP_READ)
nvme_dif_remap(req, nvme_dif_complete);
if (blk_integrity_rq(req))
dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
}
}
nvme_cleanup_cmd(req);
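
The hunks above delete the driver-local nvme_dif_remap() machinery; per the "DIF checksum code made generic" item in the pull summary, the ref-tag remapping now goes through generic block-layer helpers. A condensed sketch of the replacement call pattern, lifted from the core.c hunks earlier in this diff (error handling elided):

	/* on submission, for writes: remap ref tags from the bip seed to the
	 * physical LBA before the command goes out */
	if (req_op(req) == REQ_OP_WRITE)
		t10_pi_prepare(req, ns->pi_type);

	/* on completion, for successful reads: remap back to the seed */
	if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
	    nvme_req(req)->status == 0)
		t10_pi_complete(req, ns->pi_type,
				blk_rq_bytes(req) >> ns->lba_shift);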


@@ -40,13 +40,14 @@
#define NVME_RDMA_MAX_SEGMENTS 256
#define NVME_RDMA_MAX_INLINE_SEGMENTS 1
#define NVME_RDMA_MAX_INLINE_SEGMENTS 4
struct nvme_rdma_device {
struct ib_device *dev;
struct ib_pd *pd;
struct kref ref;
struct list_head entry;
unsigned int num_inline_segments;
};
struct nvme_rdma_qe {
@@ -117,6 +118,7 @@ struct nvme_rdma_ctrl {
struct sockaddr_storage src_addr;
struct nvme_ctrl ctrl;
bool use_inline_data;
};
static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
@@ -249,7 +251,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
/* +1 for drain */
init_attr.cap.max_recv_wr = queue->queue_size + 1;
init_attr.cap.max_recv_sge = 1;
init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS;
init_attr.cap.max_send_sge = 1 + dev->num_inline_segments;
init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
init_attr.qp_type = IB_QPT_RC;
init_attr.send_cq = queue->ib_cq;
@@ -286,6 +288,7 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
struct ib_device *ibdev = dev->dev;
int ret;
nvme_req(rq)->ctrl = &ctrl->ctrl;
ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
DMA_TO_DEVICE);
if (ret)
@@ -374,6 +377,8 @@ nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
goto out_free_pd;
}
ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS,
ndev->dev->attrs.max_sge - 1);
list_add(&ndev->entry, &device_list);
out_unlock:
mutex_unlock(&device_list_mutex);
@@ -868,6 +873,31 @@ out_free_io_queues:
return ret;
}
static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
bool remove)
{
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request,
&ctrl->ctrl);
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_destroy_admin_queue(ctrl, remove);
}
static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
bool remove)
{
if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request,
&ctrl->ctrl);
if (remove)
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_destroy_io_queues(ctrl, remove);
}
}
static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@@ -912,21 +942,44 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
}
}
static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
{
struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
struct nvme_rdma_ctrl, reconnect_work);
int ret = -EINVAL;
bool changed;
int ret;
++ctrl->ctrl.nr_reconnects;
ret = nvme_rdma_configure_admin_queue(ctrl, false);
ret = nvme_rdma_configure_admin_queue(ctrl, new);
if (ret)
goto requeue;
return ret;
if (ctrl->ctrl.icdoff) {
dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
goto destroy_admin;
}
if (!(ctrl->ctrl.sgls & (1 << 2))) {
dev_err(ctrl->ctrl.device,
"Mandatory keyed sgls are not supported!\n");
goto destroy_admin;
}
if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) {
dev_warn(ctrl->ctrl.device,
"queue_size %zu > ctrl sqsize %u, clamping down\n",
ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
}
if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
dev_warn(ctrl->ctrl.device,
"sqsize %u > ctrl maxcmd %u, clamping down\n",
ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
}
if (ctrl->ctrl.sgls & (1 << 20))
ctrl->use_inline_data = true;
if (ctrl->ctrl.queue_count > 1) {
ret = nvme_rdma_configure_io_queues(ctrl, false);
ret = nvme_rdma_configure_io_queues(ctrl, new);
if (ret)
goto destroy_admin;
}
@@ -935,10 +988,31 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
if (!changed) {
/* state change failure is ok if we're in DELETING state */
WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
return;
ret = -EINVAL;
goto destroy_io;
}
nvme_start_ctrl(&ctrl->ctrl);
return 0;
destroy_io:
if (ctrl->ctrl.queue_count > 1)
nvme_rdma_destroy_io_queues(ctrl, new);
destroy_admin:
nvme_rdma_stop_queue(&ctrl->queues[0]);
nvme_rdma_destroy_admin_queue(ctrl, new);
return ret;
}
static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
{
struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
struct nvme_rdma_ctrl, reconnect_work);
++ctrl->ctrl.nr_reconnects;
if (nvme_rdma_setup_ctrl(ctrl, false))
goto requeue;
dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
ctrl->ctrl.nr_reconnects);
@@ -947,9 +1021,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
return;
destroy_admin:
nvme_rdma_stop_queue(&ctrl->queues[0]);
nvme_rdma_destroy_admin_queue(ctrl, false);
requeue:
dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
ctrl->ctrl.nr_reconnects);
@@ -962,27 +1033,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
struct nvme_rdma_ctrl, err_work);
nvme_stop_keep_alive(&ctrl->ctrl);
if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
blk_mq_tagset_busy_iter(&ctrl->tag_set,
nvme_cancel_request, &ctrl->ctrl);
nvme_rdma_destroy_io_queues(ctrl, false);
}
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_cancel_request, &ctrl->ctrl);
nvme_rdma_destroy_admin_queue(ctrl, false);
/*
* queues are not alive anymore, so restart the queues to fail fast
* new IO
*/
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_teardown_io_queues(ctrl, false);
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_teardown_admin_queue(ctrl, false);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */
@@ -1090,19 +1143,27 @@ static int nvme_rdma_set_sg_null(struct nvme_command *c)
}
static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
struct nvme_rdma_request *req, struct nvme_command *c)
struct nvme_rdma_request *req, struct nvme_command *c,
int count)
{
struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
struct scatterlist *sgl = req->sg_table.sgl;
struct ib_sge *sge = &req->sge[1];
u32 len = 0;
int i;
req->sge[1].addr = sg_dma_address(req->sg_table.sgl);
req->sge[1].length = sg_dma_len(req->sg_table.sgl);
req->sge[1].lkey = queue->device->pd->local_dma_lkey;
for (i = 0; i < count; i++, sgl++, sge++) {
sge->addr = sg_dma_address(sgl);
sge->length = sg_dma_len(sgl);
sge->lkey = queue->device->pd->local_dma_lkey;
len += sge->length;
}
sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
sg->length = cpu_to_le32(len);
sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
req->num_sge++;
req->num_sge += count;
return 0;
}
@@ -1195,15 +1256,16 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
goto out_free_table;
}
if (count == 1) {
if (count <= dev->num_inline_segments) {
if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
queue->ctrl->use_inline_data &&
blk_rq_payload_bytes(rq) <=
nvme_rdma_inline_data_size(queue)) {
ret = nvme_rdma_map_sg_inline(queue, req, c);
ret = nvme_rdma_map_sg_inline(queue, req, c, count);
goto out;
}
if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
ret = nvme_rdma_map_sg_single(queue, req, c);
goto out;
}
@@ -1574,6 +1636,7 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
case RDMA_CM_EVENT_CONNECT_ERROR:
case RDMA_CM_EVENT_UNREACHABLE:
nvme_rdma_destroy_queue_ib(queue);
/* fall through */
case RDMA_CM_EVENT_ADDR_ERROR:
dev_dbg(queue->ctrl->ctrl.device,
"CM error event %d\n", ev->event);
@@ -1736,25 +1799,12 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
{
if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
blk_mq_tagset_busy_iter(&ctrl->tag_set,
nvme_cancel_request, &ctrl->ctrl);
nvme_rdma_destroy_io_queues(ctrl, shutdown);
}
nvme_rdma_teardown_io_queues(ctrl, shutdown);
if (shutdown)
nvme_shutdown_ctrl(&ctrl->ctrl);
else
nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_cancel_request, &ctrl->ctrl);
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_destroy_admin_queue(ctrl, shutdown);
nvme_rdma_teardown_admin_queue(ctrl, shutdown);
}
static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
@@ -1766,8 +1816,6 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
{
struct nvme_rdma_ctrl *ctrl =
container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);
int ret;
bool changed;
nvme_stop_ctrl(&ctrl->ctrl);
nvme_rdma_shutdown_ctrl(ctrl, false);
@@ -1778,25 +1826,9 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
return;
}
ret = nvme_rdma_configure_admin_queue(ctrl, false);
if (ret)
if (nvme_rdma_setup_ctrl(ctrl, false))
goto out_fail;
if (ctrl->ctrl.queue_count > 1) {
ret = nvme_rdma_configure_io_queues(ctrl, false);
if (ret)
goto out_fail;
}
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
if (!changed) {
/* state change failure is ok if we're in DELETING state */
WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
return;
}
nvme_start_ctrl(&ctrl->ctrl);
return;
out_fail:
@@ -1959,49 +1991,10 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
WARN_ON_ONCE(!changed);
ret = nvme_rdma_configure_admin_queue(ctrl, true);
ret = nvme_rdma_setup_ctrl(ctrl, true);
if (ret)
goto out_uninit_ctrl;
/* sanity check icdoff */
if (ctrl->ctrl.icdoff) {
dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
ret = -EINVAL;
goto out_remove_admin_queue;
}
/* sanity check keyed sgls */
if (!(ctrl->ctrl.sgls & (1 << 2))) {
dev_err(ctrl->ctrl.device,
"Mandatory keyed sgls are not supported!\n");
ret = -EINVAL;
goto out_remove_admin_queue;
}
/* only warn if argument is too large here, will clamp later */
if (opts->queue_size > ctrl->ctrl.sqsize + 1) {
dev_warn(ctrl->ctrl.device,
"queue_size %zu > ctrl sqsize %u, clamping down\n",
opts->queue_size, ctrl->ctrl.sqsize + 1);
}
/* warn if maxcmd is lower than sqsize+1 */
if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
dev_warn(ctrl->ctrl.device,
"sqsize %u > ctrl maxcmd %u, clamping down\n",
ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
}
if (opts->nr_io_queues) {
ret = nvme_rdma_configure_io_queues(ctrl, true);
if (ret)
goto out_remove_admin_queue;
}
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
@@ -2011,13 +2004,8 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
mutex_unlock(&nvme_rdma_ctrl_mutex);
nvme_start_ctrl(&ctrl->ctrl);
return &ctrl->ctrl;
out_remove_admin_queue:
nvme_rdma_stop_queue(&ctrl->queues[0]);
nvme_rdma_destroy_admin_queue(ctrl, true);
out_uninit_ctrl:
nvme_uninit_ctrl(&ctrl->ctrl);
nvme_put_ctrl(&ctrl->ctrl);
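
On the inline-data change in this file: NVME_RDMA_MAX_INLINE_SEGMENTS grows from 1 to 4, clamped per device to attrs.max_sge - 1 since one send SGE always carries the command itself (hence the "1 +" in max_send_sge). A short sketch of the resulting send descriptor layout, as a reading of the code above rather than quoted from it:

	/*
	 * sge[0]        - the nvme_command sqe
	 * sge[1..count] - up to num_inline_segments data segments, used only
	 *                 for writes on I/O queues whose payload fits within
	 *                 nvme_rdma_inline_data_size(queue)
	 */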


@@ -128,3 +128,14 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p,
return nvme_trace_common(p, cdw10);
}
}
const char *nvme_trace_disk_name(struct trace_seq *p, char *name)
{
const char *ret = trace_seq_buffer_ptr(p);
if (*name)
trace_seq_printf(p, "disk=%s, ", name);
trace_seq_putc(p, 0);
return ret;
}


@@ -50,13 +50,8 @@
nvme_admin_opcode_name(nvme_admin_security_recv), \
nvme_admin_opcode_name(nvme_admin_sanitize_nvm))
const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
u8 *cdw10);
#define __parse_nvme_admin_cmd(opcode, cdw10) \
nvme_trace_parse_admin_cmd(p, opcode, cdw10)
#define nvme_opcode_name(opcode) { opcode, #opcode }
#define show_opcode_name(val) \
#define show_nvm_opcode_name(val) \
__print_symbolic(val, \
nvme_opcode_name(nvme_cmd_flush), \
nvme_opcode_name(nvme_cmd_write), \
@@ -70,85 +65,92 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
nvme_opcode_name(nvme_cmd_resv_acquire), \
nvme_opcode_name(nvme_cmd_resv_release))
#define show_opcode_name(qid, opcode) \
(qid ? show_nvm_opcode_name(opcode) : show_admin_opcode_name(opcode))
const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
u8 *cdw10);
const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode,
u8 *cdw10);
#define __parse_nvme_cmd(opcode, cdw10) \
nvme_trace_parse_nvm_cmd(p, opcode, cdw10)
u8 *cdw10);
TRACE_EVENT(nvme_setup_admin_cmd,
TP_PROTO(struct nvme_command *cmd),
TP_ARGS(cmd),
#define parse_nvme_cmd(qid, opcode, cdw10) \
(qid ? \
nvme_trace_parse_nvm_cmd(p, opcode, cdw10) : \
nvme_trace_parse_admin_cmd(p, opcode, cdw10))
const char *nvme_trace_disk_name(struct trace_seq *p, char *name);
#define __print_disk_name(name) \
nvme_trace_disk_name(p, name)
#ifndef TRACE_HEADER_MULTI_READ
static inline void __assign_disk_name(char *name, struct gendisk *disk)
{
if (disk)
memcpy(name, disk->disk_name, DISK_NAME_LEN);
else
memset(name, 0, DISK_NAME_LEN);
}
#endif
TRACE_EVENT(nvme_setup_cmd,
TP_PROTO(struct request *req, struct nvme_command *cmd),
TP_ARGS(req, cmd),
TP_STRUCT__entry(
__field(u8, opcode)
__field(u8, flags)
__field(u16, cid)
__field(u64, metadata)
__array(u8, cdw10, 24)
__array(char, disk, DISK_NAME_LEN)
__field(int, ctrl_id)
__field(int, qid)
__field(u8, opcode)
__field(u8, flags)
__field(u16, cid)
__field(u32, nsid)
__field(u64, metadata)
__array(u8, cdw10, 24)
),
TP_fast_assign(
__entry->opcode = cmd->common.opcode;
__entry->flags = cmd->common.flags;
__entry->cid = cmd->common.command_id;
__entry->metadata = le64_to_cpu(cmd->common.metadata);
memcpy(__entry->cdw10, cmd->common.cdw10,
sizeof(__entry->cdw10));
__entry->ctrl_id = nvme_req(req)->ctrl->instance;
__entry->qid = nvme_req_qid(req);
__entry->opcode = cmd->common.opcode;
__entry->flags = cmd->common.flags;
__entry->cid = cmd->common.command_id;
__entry->nsid = le32_to_cpu(cmd->common.nsid);
__entry->metadata = le64_to_cpu(cmd->common.metadata);
__assign_disk_name(__entry->disk, req->rq_disk);
memcpy(__entry->cdw10, cmd->common.cdw10,
sizeof(__entry->cdw10));
),
TP_printk(" cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
__entry->cid, __entry->flags, __entry->metadata,
show_admin_opcode_name(__entry->opcode),
__parse_nvme_admin_cmd(__entry->opcode, __entry->cdw10))
);
TRACE_EVENT(nvme_setup_nvm_cmd,
TP_PROTO(int qid, struct nvme_command *cmd),
TP_ARGS(qid, cmd),
TP_STRUCT__entry(
__field(int, qid)
__field(u8, opcode)
__field(u8, flags)
__field(u16, cid)
__field(u32, nsid)
__field(u64, metadata)
__array(u8, cdw10, 24)
),
TP_fast_assign(
__entry->qid = qid;
__entry->opcode = cmd->common.opcode;
__entry->flags = cmd->common.flags;
__entry->cid = cmd->common.command_id;
__entry->nsid = le32_to_cpu(cmd->common.nsid);
__entry->metadata = le64_to_cpu(cmd->common.metadata);
memcpy(__entry->cdw10, cmd->common.cdw10,
sizeof(__entry->cdw10));
),
TP_printk("qid=%d, nsid=%u, cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
__entry->qid, __entry->nsid, __entry->cid,
TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
__entry->ctrl_id, __print_disk_name(__entry->disk),
__entry->qid, __entry->cid, __entry->nsid,
__entry->flags, __entry->metadata,
show_opcode_name(__entry->opcode),
__parse_nvme_cmd(__entry->opcode, __entry->cdw10))
show_opcode_name(__entry->qid, __entry->opcode),
parse_nvme_cmd(__entry->qid, __entry->opcode, __entry->cdw10))
);
TRACE_EVENT(nvme_complete_rq,
TP_PROTO(struct request *req),
TP_ARGS(req),
TP_STRUCT__entry(
__field(int, qid)
__field(int, cid)
__field(u64, result)
__field(u8, retries)
__field(u8, flags)
__field(u16, status)
__array(char, disk, DISK_NAME_LEN)
__field(int, ctrl_id)
__field(int, qid)
__field(int, cid)
__field(u64, result)
__field(u8, retries)
__field(u8, flags)
__field(u16, status)
),
TP_fast_assign(
__entry->qid = req->q->id;
__entry->cid = req->tag;
__entry->result = le64_to_cpu(nvme_req(req)->result.u64);
__entry->retries = nvme_req(req)->retries;
__entry->flags = nvme_req(req)->flags;
__entry->status = nvme_req(req)->status;
__entry->ctrl_id = nvme_req(req)->ctrl->instance;
__entry->qid = nvme_req_qid(req);
__entry->cid = req->tag;
__entry->result = le64_to_cpu(nvme_req(req)->result.u64);
__entry->retries = nvme_req(req)->retries;
__entry->flags = nvme_req(req)->flags;
__entry->status = nvme_req(req)->status;
__assign_disk_name(__entry->disk, req->rq_disk);
),
TP_printk("qid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u",
TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u",
__entry->ctrl_id, __print_disk_name(__entry->disk),
__entry->qid, __entry->cid, __entry->result,
__entry->retries, __entry->flags, __entry->status)
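
With the restructured events above, both tracepoints print the controller instance and, when available, the disk name ahead of the queue id. A sketch of the resulting trace lines, with device names and values invented for illustration:

	nvme_setup_cmd: nvme0: disk=nvme0n1, qid=1, cmdid=24, nsid=1, flags=0x0, meta=0x0, cmd=(nvme_cmd_read slba=0, len=7, ctrl=0x0, dsmgmt=0, reftag=0)
	nvme_complete_rq: nvme0: disk=nvme0n1, qid=1, cmdid=24, res=0, retries=0, flags=0x0, status=0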