Merge branch 'for-4.12/block' of git://git.kernel.dk/linux-block

Pull block layer updates from Jens Axboe:

 - Add BFQ IO scheduler under the new blk-mq scheduling framework. BFQ
   was initially a fork of CFQ, but subsequently changed to implement
   fairness based on B-WF2Q+, a modified variant of WF2Q. BFQ is meant
   to be used on desktop type single drives, providing good fairness.
   From Paolo.

 - Add Kyber IO scheduler. This is a full multiqueue aware scheduler,
   using a scalable token based algorithm that throttles IO based on
   live completion IO stats, similary to blk-wbt. From Omar.

 - A series from Jan, moving users to separately allocated backing
   devices. This continues the work of separating backing device life
   times, solving various problems with hot removal.

 - A series of updates for lightnvm, mostly from Javier. Includes a
   'pblk' target that exposes an open channel SSD as a physical block
   device.

 - A series of fixes and improvements for nbd from Josef.

 - A series from Omar, removing queue sharing between devices on mostly
   legacy drivers. This helps us clean up other bits, if we know that a
   queue only has a single device backing. This has been overdue for
   more than a decade.

 - Fixes for the blk-stats, and improvements to unify the stats and user
   windows. This both improves blk-wbt, and enables other users to
   register a need to receive IO stats for a device. From Omar.

 - blk-throttle improvements from Shaohua. This provides a scalable
   framework for implementing scalable priotization - particularly for
   blk-mq, but applicable to any type of block device. The interface is
   marked experimental for now.

 - Bucketized IO stats for IO polling from Stephen Bates. This improves
   efficiency of polled workloads in the presence of mixed block size
   IO.

 - A few fixes for opal, from Scott.

 - A few pulls for NVMe, including a lot of fixes for NVMe-over-fabrics.
   From a variety of folks, mostly Sagi and James Smart.

 - A series from Bart, improving our exposed info and capabilities from
   the blk-mq debugfs support.

 - A series from Christoph, cleaning up how handle WRITE_ZEROES.

 - A series from Christoph, cleaning up the block layer handling of how
   we track errors in a request. On top of being a nice cleanup, it also
   shrinks the size of struct request a bit.

 - Removal of mg_disk and hd (sorry Linus) by Christoph. The former was
   never used by platforms, and the latter has outlived it's usefulness.

 - Various little bug fixes and cleanups from a wide variety of folks.

* 'for-4.12/block' of git://git.kernel.dk/linux-block: (329 commits)
  block: hide badblocks attribute by default
  blk-mq: unify hctx delay_work and run_work
  block: add kblock_mod_delayed_work_on()
  blk-mq: unify hctx delayed_run_work and run_work
  nbd: fix use after free on module unload
  MAINTAINERS: bfq: Add Paolo as maintainer for the BFQ I/O scheduler
  blk-mq-sched: alloate reserved tags out of normal pool
  mtip32xx: use runtime tag to initialize command header
  scsi: Implement blk_mq_ops.show_rq()
  blk-mq: Add blk_mq_ops.show_rq()
  blk-mq: Show operation, cmd_flags and rq_flags names
  blk-mq: Make blk_flags_show() callers append a newline character
  blk-mq: Move the "state" debugfs attribute one level down
  blk-mq: Unregister debugfs attributes earlier
  blk-mq: Only unregister hctxs for which registration succeeded
  blk-mq-debugfs: Rename functions for registering and unregistering the mq directory
  blk-mq: Let blk_mq_debugfs_register() look up the queue name
  blk-mq: Register <dev>/queue/mq after having registered <dev>/queue
  ide-pm: always pass 0 error to ide_complete_rq in ide_do_devset
  ide-pm: always pass 0 error to __blk_end_request_all
  ..
This commit is contained in:
Linus Torvalds
2017-05-01 10:39:57 -07:00
255 changed files with 24703 additions and 6212 deletions

View File

@@ -418,6 +418,46 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RW(provisioning_mode);
static const char *zeroing_mode[] = {
[SD_ZERO_WRITE] = "write",
[SD_ZERO_WS] = "writesame",
[SD_ZERO_WS16_UNMAP] = "writesame_16_unmap",
[SD_ZERO_WS10_UNMAP] = "writesame_10_unmap",
};
static ssize_t
zeroing_mode_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct scsi_disk *sdkp = to_scsi_disk(dev);
return snprintf(buf, 20, "%s\n", zeroing_mode[sdkp->zeroing_mode]);
}
static ssize_t
zeroing_mode_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct scsi_disk *sdkp = to_scsi_disk(dev);
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (!strncmp(buf, zeroing_mode[SD_ZERO_WRITE], 20))
sdkp->zeroing_mode = SD_ZERO_WRITE;
else if (!strncmp(buf, zeroing_mode[SD_ZERO_WS], 20))
sdkp->zeroing_mode = SD_ZERO_WS;
else if (!strncmp(buf, zeroing_mode[SD_ZERO_WS16_UNMAP], 20))
sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP;
else if (!strncmp(buf, zeroing_mode[SD_ZERO_WS10_UNMAP], 20))
sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP;
else
return -EINVAL;
return count;
}
static DEVICE_ATTR_RW(zeroing_mode);
static ssize_t
max_medium_access_timeouts_show(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -496,6 +536,7 @@ static struct attribute *sd_disk_attrs[] = {
&dev_attr_app_tag_own.attr,
&dev_attr_thin_provisioning.attr,
&dev_attr_provisioning_mode.attr,
&dev_attr_zeroing_mode.attr,
&dev_attr_max_write_same_blocks.attr,
&dev_attr_max_medium_access_timeouts.attr,
NULL,
@@ -644,26 +685,11 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
unsigned int logical_block_size = sdkp->device->sector_size;
unsigned int max_blocks = 0;
q->limits.discard_zeroes_data = 0;
/*
* When LBPRZ is reported, discard alignment and granularity
* must be fixed to the logical block size. Otherwise the block
* layer will drop misaligned portions of the request which can
* lead to data corruption. If LBPRZ is not set, we honor the
* device preference.
*/
if (sdkp->lbprz) {
q->limits.discard_alignment = 0;
q->limits.discard_granularity = logical_block_size;
} else {
q->limits.discard_alignment = sdkp->unmap_alignment *
logical_block_size;
q->limits.discard_granularity =
max(sdkp->physical_block_size,
sdkp->unmap_granularity * logical_block_size);
}
q->limits.discard_alignment =
sdkp->unmap_alignment * logical_block_size;
q->limits.discard_granularity =
max(sdkp->physical_block_size,
sdkp->unmap_granularity * logical_block_size);
sdkp->provisioning_mode = mode;
switch (mode) {
@@ -681,19 +707,16 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
case SD_LBP_WS16:
max_blocks = min_not_zero(sdkp->max_ws_blocks,
(u32)SD_MAX_WS16_BLOCKS);
q->limits.discard_zeroes_data = sdkp->lbprz;
break;
case SD_LBP_WS10:
max_blocks = min_not_zero(sdkp->max_ws_blocks,
(u32)SD_MAX_WS10_BLOCKS);
q->limits.discard_zeroes_data = sdkp->lbprz;
break;
case SD_LBP_ZERO:
max_blocks = min_not_zero(sdkp->max_ws_blocks,
(u32)SD_MAX_WS10_BLOCKS);
q->limits.discard_zeroes_data = 1;
break;
}
@@ -701,93 +724,122 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
}
/**
* sd_setup_discard_cmnd - unmap blocks on thinly provisioned device
* @sdp: scsi device to operate on
* @rq: Request to prepare
*
* Will issue either UNMAP or WRITE SAME(16) depending on preference
* indicated by target device.
**/
static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd)
static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
{
struct scsi_device *sdp = cmd->device;
struct request *rq = cmd->request;
u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
unsigned int data_len = 24;
char *buf;
rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
if (!rq->special_vec.bv_page)
return BLKPREP_DEFER;
rq->special_vec.bv_offset = 0;
rq->special_vec.bv_len = data_len;
rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
cmd->cmd_len = 10;
cmd->cmnd[0] = UNMAP;
cmd->cmnd[8] = 24;
buf = page_address(rq->special_vec.bv_page);
put_unaligned_be16(6 + 16, &buf[0]);
put_unaligned_be16(16, &buf[2]);
put_unaligned_be64(sector, &buf[8]);
put_unaligned_be32(nr_sectors, &buf[16]);
cmd->allowed = SD_MAX_RETRIES;
cmd->transfersize = data_len;
rq->timeout = SD_TIMEOUT;
scsi_req(rq)->resid_len = data_len;
return scsi_init_io(cmd);
}
static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
{
struct scsi_device *sdp = cmd->device;
struct request *rq = cmd->request;
u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
u32 data_len = sdp->sector_size;
rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
if (!rq->special_vec.bv_page)
return BLKPREP_DEFER;
rq->special_vec.bv_offset = 0;
rq->special_vec.bv_len = data_len;
rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
cmd->cmd_len = 16;
cmd->cmnd[0] = WRITE_SAME_16;
if (unmap)
cmd->cmnd[1] = 0x8; /* UNMAP */
put_unaligned_be64(sector, &cmd->cmnd[2]);
put_unaligned_be32(nr_sectors, &cmd->cmnd[10]);
cmd->allowed = SD_MAX_RETRIES;
cmd->transfersize = data_len;
rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT;
scsi_req(rq)->resid_len = data_len;
return scsi_init_io(cmd);
}
static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap)
{
struct scsi_device *sdp = cmd->device;
struct request *rq = cmd->request;
u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
u32 data_len = sdp->sector_size;
rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
if (!rq->special_vec.bv_page)
return BLKPREP_DEFER;
rq->special_vec.bv_offset = 0;
rq->special_vec.bv_len = data_len;
rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
cmd->cmd_len = 10;
cmd->cmnd[0] = WRITE_SAME;
if (unmap)
cmd->cmnd[1] = 0x8; /* UNMAP */
put_unaligned_be32(sector, &cmd->cmnd[2]);
put_unaligned_be16(nr_sectors, &cmd->cmnd[7]);
cmd->allowed = SD_MAX_RETRIES;
cmd->transfersize = data_len;
rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT;
scsi_req(rq)->resid_len = data_len;
return scsi_init_io(cmd);
}
static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
{
struct request *rq = cmd->request;
struct scsi_device *sdp = cmd->device;
struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
sector_t sector = blk_rq_pos(rq);
unsigned int nr_sectors = blk_rq_sectors(rq);
unsigned int len;
int ret;
char *buf;
struct page *page;
u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
sector >>= ilog2(sdp->sector_size) - 9;
nr_sectors >>= ilog2(sdp->sector_size) - 9;
page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
if (!page)
return BLKPREP_DEFER;
switch (sdkp->provisioning_mode) {
case SD_LBP_UNMAP:
buf = page_address(page);
cmd->cmd_len = 10;
cmd->cmnd[0] = UNMAP;
cmd->cmnd[8] = 24;
put_unaligned_be16(6 + 16, &buf[0]);
put_unaligned_be16(16, &buf[2]);
put_unaligned_be64(sector, &buf[8]);
put_unaligned_be32(nr_sectors, &buf[16]);
len = 24;
break;
case SD_LBP_WS16:
cmd->cmd_len = 16;
cmd->cmnd[0] = WRITE_SAME_16;
cmd->cmnd[1] = 0x8; /* UNMAP */
put_unaligned_be64(sector, &cmd->cmnd[2]);
put_unaligned_be32(nr_sectors, &cmd->cmnd[10]);
len = sdkp->device->sector_size;
break;
case SD_LBP_WS10:
case SD_LBP_ZERO:
cmd->cmd_len = 10;
cmd->cmnd[0] = WRITE_SAME;
if (sdkp->provisioning_mode == SD_LBP_WS10)
cmd->cmnd[1] = 0x8; /* UNMAP */
put_unaligned_be32(sector, &cmd->cmnd[2]);
put_unaligned_be16(nr_sectors, &cmd->cmnd[7]);
len = sdkp->device->sector_size;
break;
default:
ret = BLKPREP_INVALID;
goto out;
if (!(rq->cmd_flags & REQ_NOUNMAP)) {
switch (sdkp->zeroing_mode) {
case SD_ZERO_WS16_UNMAP:
return sd_setup_write_same16_cmnd(cmd, true);
case SD_ZERO_WS10_UNMAP:
return sd_setup_write_same10_cmnd(cmd, true);
}
}
rq->timeout = SD_TIMEOUT;
cmd->transfersize = len;
cmd->allowed = SD_MAX_RETRIES;
rq->special_vec.bv_page = page;
rq->special_vec.bv_offset = 0;
rq->special_vec.bv_len = len;
rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
scsi_req(rq)->resid_len = len;
ret = scsi_init_io(cmd);
out:
if (ret != BLKPREP_OK)
__free_page(page);
return ret;
if (sdp->no_write_same)
return BLKPREP_INVALID;
if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff)
return sd_setup_write_same16_cmnd(cmd, false);
return sd_setup_write_same10_cmnd(cmd, false);
}
static void sd_config_write_same(struct scsi_disk *sdkp)
@@ -816,9 +868,20 @@ static void sd_config_write_same(struct scsi_disk *sdkp)
sdkp->max_ws_blocks = 0;
}
if (sdkp->lbprz && sdkp->lbpws)
sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP;
else if (sdkp->lbprz && sdkp->lbpws10)
sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP;
else if (sdkp->max_ws_blocks)
sdkp->zeroing_mode = SD_ZERO_WS;
else
sdkp->zeroing_mode = SD_ZERO_WRITE;
out:
blk_queue_max_write_same_sectors(q, sdkp->max_ws_blocks *
(logical_block_size >> 9));
blk_queue_max_write_zeroes_sectors(q, sdkp->max_ws_blocks *
(logical_block_size >> 9));
}
/**
@@ -1155,7 +1218,20 @@ static int sd_init_command(struct scsi_cmnd *cmd)
switch (req_op(rq)) {
case REQ_OP_DISCARD:
return sd_setup_discard_cmnd(cmd);
switch (scsi_disk(rq->rq_disk)->provisioning_mode) {
case SD_LBP_UNMAP:
return sd_setup_unmap_cmnd(cmd);
case SD_LBP_WS16:
return sd_setup_write_same16_cmnd(cmd, true);
case SD_LBP_WS10:
return sd_setup_write_same10_cmnd(cmd, true);
case SD_LBP_ZERO:
return sd_setup_write_same10_cmnd(cmd, false);
default:
return BLKPREP_INVALID;
}
case REQ_OP_WRITE_ZEROES:
return sd_setup_write_zeroes_cmnd(cmd);
case REQ_OP_WRITE_SAME:
return sd_setup_write_same_cmnd(cmd);
case REQ_OP_FLUSH:
@@ -1795,6 +1871,7 @@ static int sd_done(struct scsi_cmnd *SCpnt)
switch (req_op(req)) {
case REQ_OP_DISCARD:
case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE_SAME:
case REQ_OP_ZONE_RESET:
if (!result) {
@@ -2768,7 +2845,7 @@ static void sd_read_block_limits(struct scsi_disk *sdkp)
sd_config_discard(sdkp, SD_LBP_WS16);
} else { /* LBP VPD page tells us what to use */
if (sdkp->lbpu && sdkp->max_unmap_blocks && !sdkp->lbprz)
if (sdkp->lbpu && sdkp->max_unmap_blocks)
sd_config_discard(sdkp, SD_LBP_UNMAP);
else if (sdkp->lbpws)
sd_config_discard(sdkp, SD_LBP_WS16);