Merge branch 'for-4.15/block' of git://git.kernel.dk/linux-block

Pull core block layer updates from Jens Axboe:
 "This is the main pull request for block storage for 4.15-rc1.

  Nothing out of the ordinary in here, and no API changes or anything
  like that. Just various new features for drivers, core changes, etc.
  In particular, this pull request contains:

   - A patch series from Bart, closing the whole on blk/scsi-mq queue
     quescing.

   - A series from Christoph, building towards hidden gendisks (for
     multipath) and ability to move bio chains around.

   - NVMe
        - Support for native multipath for NVMe (Christoph).
        - Userspace notifications for AENs (Keith).
        - Command side-effects support (Keith).
        - SGL support (Chaitanya Kulkarni)
        - FC fixes and improvements (James Smart)
        - Lots of fixes and tweaks (Various)

   - bcache
        - New maintainer (Michael Lyle)
        - Writeback control improvements (Michael)
        - Various fixes (Coly, Elena, Eric, Liang, et al)

   - lightnvm updates, mostly centered around the pblk interface
     (Javier, Hans, and Rakesh).

   - Removal of unused bio/bvec kmap atomic interfaces (me, Christoph)

   - Writeback series that fix the much discussed hundreds of millions
     of sync-all units. This goes all the way, as discussed previously
     (me).

   - Fix for missing wakeup on writeback timer adjustments (Yafang
     Shao).

   - Fix laptop mode on blk-mq (me).

   - {mq,name} tupple lookup for IO schedulers, allowing us to have
     alias names. This means you can use 'deadline' on both !mq and on
     mq (where it's called mq-deadline). (me).

   - blktrace race fix, oopsing on sg load (me).

   - blk-mq optimizations (me).

   - Obscure waitqueue race fix for kyber (Omar).

   - NBD fixes (Josef).

   - Disable writeback throttling by default on bfq, like we do on cfq
     (Luca Miccio).

   - Series from Ming that enable us to treat flush requests on blk-mq
     like any other request. This is a really nice cleanup.

   - Series from Ming that improves merging on blk-mq with schedulers,
     getting us closer to flipping the switch on scsi-mq again.

   - BFQ updates (Paolo).

   - blk-mq atomic flags memory ordering fixes (Peter Z).

   - Loop cgroup support (Shaohua).

   - Lots of minor fixes from lots of different folks, both for core and
     driver code"

* 'for-4.15/block' of git://git.kernel.dk/linux-block: (294 commits)
  nvme: fix visibility of "uuid" ns attribute
  blk-mq: fixup some comment typos and lengths
  ide: ide-atapi: fix compile error with defining macro DEBUG
  blk-mq: improve tag waiting setup for non-shared tags
  brd: remove unused brd_mutex
  blk-mq: only run the hardware queue if IO is pending
  block: avoid null pointer dereference on null disk
  fs: guard_bio_eod() needs to consider partitions
  xtensa/simdisk: fix compile error
  nvme: expose subsys attribute to sysfs
  nvme: create 'slaves' and 'holders' entries for hidden controllers
  block: create 'slaves' and 'holders' entries for hidden gendisks
  nvme: also expose the namespace identification sysfs files for mpath nodes
  nvme: implement multipath access to nvme subsystems
  nvme: track shared namespaces
  nvme: introduce a nvme_ns_ids structure
  nvme: track subsystems
  block, nvme: Introduce blk_mq_req_flags_t
  block, scsi: Make SCSI quiesce and resume work reliably
  block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag
  ...
This commit is contained in:
Linus Torvalds
2017-11-14 15:32:19 -08:00
131 changed files with 5485 additions and 3104 deletions

View File

@@ -13,7 +13,6 @@
*/
#include <linux/aer.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>
@@ -26,12 +25,9 @@
#include <linux/mutex.h>
#include <linux/once.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/t10-pi.h>
#include <linux/timer.h>
#include <linux/types.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <asm/unaligned.h>
#include <linux/sed-opal.h>
#include "nvme.h"
@@ -39,11 +35,7 @@
#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
/*
* We handle AEN commands ourselves and don't even let the
* block layer know about them.
*/
#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AERS)
#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);
@@ -57,6 +49,12 @@ module_param(max_host_mem_size_mb, uint, 0444);
MODULE_PARM_DESC(max_host_mem_size_mb,
"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");
static unsigned int sgl_threshold = SZ_32K;
module_param(sgl_threshold, uint, 0644);
MODULE_PARM_DESC(sgl_threshold,
"Use SGLs when average request segment size is larger or equal to "
"this size. Use 0 to disable SGLs.");
static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops io_queue_depth_ops = {
.set = io_queue_depth_set,
@@ -178,6 +176,7 @@ struct nvme_queue {
struct nvme_iod {
struct nvme_request req;
struct nvme_queue *nvmeq;
bool use_sgl;
int aborted;
int npages; /* In the PRP list. 0 means small pool in use */
int nents; /* Used in scatterlist */
@@ -331,17 +330,35 @@ static int nvme_npages(unsigned size, struct nvme_dev *dev)
return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}
static unsigned int nvme_iod_alloc_size(struct nvme_dev *dev,
unsigned int size, unsigned int nseg)
/*
* Calculates the number of pages needed for the SGL segments. For example a 4k
* page can accommodate 256 SGL descriptors.
*/
static int nvme_pci_npages_sgl(unsigned int num_seg)
{
return sizeof(__le64 *) * nvme_npages(size, dev) +
sizeof(struct scatterlist) * nseg;
return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE);
}
static unsigned int nvme_cmd_size(struct nvme_dev *dev)
static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev,
unsigned int size, unsigned int nseg, bool use_sgl)
{
return sizeof(struct nvme_iod) +
nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES);
size_t alloc_size;
if (use_sgl)
alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg);
else
alloc_size = sizeof(__le64 *) * nvme_npages(size, dev);
return alloc_size + sizeof(struct scatterlist) * nseg;
}
static unsigned int nvme_pci_cmd_size(struct nvme_dev *dev, bool use_sgl)
{
unsigned int alloc_size = nvme_pci_iod_alloc_size(dev,
NVME_INT_BYTES(dev), NVME_INT_PAGES,
use_sgl);
return sizeof(struct nvme_iod) + alloc_size;
}
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
@@ -425,10 +442,10 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
nvmeq->sq_tail = tail;
}
static __le64 **iod_list(struct request *req)
static void **nvme_pci_iod_list(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req));
return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
}
static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
@@ -438,7 +455,10 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
unsigned int size = blk_rq_payload_bytes(rq);
if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC);
size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg,
iod->use_sgl);
iod->sg = kmalloc(alloc_size, GFP_ATOMIC);
if (!iod->sg)
return BLK_STS_RESOURCE;
} else {
@@ -456,18 +476,31 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
const int last_prp = dev->ctrl.page_size / 8 - 1;
const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
int i;
__le64 **list = iod_list(req);
dma_addr_t prp_dma = iod->first_dma;
if (iod->npages == 0)
dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
dma_addr);
for (i = 0; i < iod->npages; i++) {
__le64 *prp_list = list[i];
dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
prp_dma = next_prp_dma;
void *addr = nvme_pci_iod_list(req)[i];
if (iod->use_sgl) {
struct nvme_sgl_desc *sg_list = addr;
next_dma_addr =
le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr);
} else {
__le64 *prp_list = addr;
next_dma_addr = le64_to_cpu(prp_list[last_prp]);
}
dma_pool_free(dev->prp_page_pool, addr, dma_addr);
dma_addr = next_dma_addr;
}
if (iod->sg != iod->inline_sg)
@@ -555,7 +588,8 @@ static void nvme_print_sgl(struct scatterlist *sgl, int nents)
}
}
static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req)
static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
struct request *req, struct nvme_rw_command *cmnd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct dma_pool *pool;
@@ -566,14 +600,16 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req)
u32 page_size = dev->ctrl.page_size;
int offset = dma_addr & (page_size - 1);
__le64 *prp_list;
__le64 **list = iod_list(req);
void **list = nvme_pci_iod_list(req);
dma_addr_t prp_dma;
int nprps, i;
iod->use_sgl = false;
length -= (page_size - offset);
if (length <= 0) {
iod->first_dma = 0;
return BLK_STS_OK;
goto done;
}
dma_len -= (page_size - offset);
@@ -587,7 +623,7 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req)
if (length <= page_size) {
iod->first_dma = dma_addr;
return BLK_STS_OK;
goto done;
}
nprps = DIV_ROUND_UP(length, page_size);
@@ -634,6 +670,10 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req)
dma_len = sg_dma_len(sg);
}
done:
cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
return BLK_STS_OK;
bad_sgl:
@@ -643,6 +683,110 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req)
return BLK_STS_IOERR;
}
static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
struct scatterlist *sg)
{
sge->addr = cpu_to_le64(sg_dma_address(sg));
sge->length = cpu_to_le32(sg_dma_len(sg));
sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}
static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
dma_addr_t dma_addr, int entries)
{
sge->addr = cpu_to_le64(dma_addr);
if (entries < SGES_PER_PAGE) {
sge->length = cpu_to_le32(entries * sizeof(*sge));
sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
} else {
sge->length = cpu_to_le32(PAGE_SIZE);
sge->type = NVME_SGL_FMT_SEG_DESC << 4;
}
}
static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
struct request *req, struct nvme_rw_command *cmd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
int length = blk_rq_payload_bytes(req);
struct dma_pool *pool;
struct nvme_sgl_desc *sg_list;
struct scatterlist *sg = iod->sg;
int entries = iod->nents, i = 0;
dma_addr_t sgl_dma;
iod->use_sgl = true;
/* setting the transfer type as SGL */
cmd->flags = NVME_CMD_SGL_METABUF;
if (length == sg_dma_len(sg)) {
nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
return BLK_STS_OK;
}
if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
pool = dev->prp_small_pool;
iod->npages = 0;
} else {
pool = dev->prp_page_pool;
iod->npages = 1;
}
sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
if (!sg_list) {
iod->npages = -1;
return BLK_STS_RESOURCE;
}
nvme_pci_iod_list(req)[0] = sg_list;
iod->first_dma = sgl_dma;
nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
do {
if (i == SGES_PER_PAGE) {
struct nvme_sgl_desc *old_sg_desc = sg_list;
struct nvme_sgl_desc *link = &old_sg_desc[i - 1];
sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
if (!sg_list)
return BLK_STS_RESOURCE;
i = 0;
nvme_pci_iod_list(req)[iod->npages++] = sg_list;
sg_list[i++] = *link;
nvme_pci_sgl_set_seg(link, sgl_dma, entries);
}
nvme_pci_sgl_set_data(&sg_list[i++], sg);
length -= sg_dma_len(sg);
sg = sg_next(sg);
entries--;
} while (length > 0);
WARN_ON(entries > 0);
return BLK_STS_OK;
}
static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
unsigned int avg_seg_size;
avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req),
blk_rq_nr_phys_segments(req));
if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
return false;
if (!iod->nvmeq->qid)
return false;
if (!sgl_threshold || avg_seg_size < sgl_threshold)
return false;
return true;
}
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct nvme_command *cmnd)
{
@@ -662,7 +806,11 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
DMA_ATTR_NO_WARN))
goto out;
ret = nvme_setup_prps(dev, req);
if (nvme_pci_use_sgls(dev, req))
ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
else
ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
if (ret != BLK_STS_OK)
goto out_unmap;
@@ -682,8 +830,6 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
goto out_unmap;
}
cmnd->rw.dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
cmnd->rw.dptr.prp2 = cpu_to_le64(iod->first_dma);
if (blk_integrity_rq(req))
cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
return BLK_STS_OK;
@@ -804,7 +950,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
* for them but rather special case them here.
*/
if (unlikely(nvmeq->qid == 0 &&
cqe->command_id >= NVME_AQ_BLKMQ_DEPTH)) {
cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
nvme_complete_async_event(&nvmeq->dev->ctrl,
cqe->status, &cqe->result);
return;
@@ -897,7 +1043,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
return __nvme_poll(nvmeq, tag);
}
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
struct nvme_queue *nvmeq = dev->queues[0];
@@ -905,7 +1051,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
memset(&c, 0, sizeof(c));
c.common.opcode = nvme_admin_async_event;
c.common.command_id = NVME_AQ_BLKMQ_DEPTH + aer_idx;
c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
spin_lock_irq(&nvmeq->q_lock);
__nvme_submit_cmd(nvmeq, &c);
@@ -930,7 +1076,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
/*
* Note: we (ab)use the fact the the prp fields survive if no data
* Note: we (ab)use the fact that the prp fields survive if no data
* is attached to the request.
*/
memset(&c, 0, sizeof(c));
@@ -951,7 +1097,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
int flags = NVME_QUEUE_PHYS_CONTIG;
/*
* Note: we (ab)use the fact the the prp fields survive if no data
* Note: we (ab)use the fact that the prp fields survive if no data
* is attached to the request.
*/
memset(&c, 0, sizeof(c));
@@ -1372,14 +1518,10 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
dev->admin_tagset.ops = &nvme_mq_admin_ops;
dev->admin_tagset.nr_hw_queues = 1;
/*
* Subtract one to leave an empty queue entry for 'Full Queue'
* condition. See NVM-Express 1.2 specification, section 4.1.2.
*/
dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1;
dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
dev->admin_tagset.timeout = ADMIN_TIMEOUT;
dev->admin_tagset.numa_node = dev_to_node(dev->dev);
dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false);
dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
dev->admin_tagset.driver_data = dev;
@@ -1906,7 +2048,11 @@ static int nvme_dev_add(struct nvme_dev *dev)
dev->tagset.numa_node = dev_to_node(dev->dev);
dev->tagset.queue_depth =
min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
dev->tagset.cmd_size = nvme_cmd_size(dev);
dev->tagset.cmd_size = nvme_pci_cmd_size(dev, false);
if ((dev->ctrl.sgls & ((1 << 0) | (1 << 1))) && sgl_threshold) {
dev->tagset.cmd_size = max(dev->tagset.cmd_size,
nvme_pci_cmd_size(dev, true));
}
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;
@@ -2132,9 +2278,9 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
{
dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
kref_get(&dev->ctrl.kref);
nvme_get_ctrl(&dev->ctrl);
nvme_dev_disable(dev, false);
if (!schedule_work(&dev->remove_work))
if (!queue_work(nvme_wq, &dev->remove_work))
nvme_put_ctrl(&dev->ctrl);
}
@@ -2557,6 +2703,7 @@ static int __init nvme_init(void)
static void __exit nvme_exit(void)
{
pci_unregister_driver(&nvme_driver);
flush_workqueue(nvme_wq);
_nvme_check_size();
}