Merge tag 'for-4.20/block-20181021' of git://git.kernel.dk/linux-block
Pull block layer updates from Jens Axboe:
 "This is the main pull request for block changes for 4.20. This contains:

   - Series enabling runtime PM for blk-mq (Bart).

   - Two pull requests from Christoph for NVMe, with items such as;
       - Better AEN tracking
       - Multipath improvements
       - RDMA fixes
       - Rework of FC for target removal
       - Fixes for issues identified by static checkers
       - Fabric cleanups, as prep for TCP transport
       - Various cleanups and bug fixes

   - Block merging cleanups (Christoph)

   - Conversion of drivers to generic DMA mapping API (Christoph)

   - Series fixing ref count issues with blkcg (Dennis)

   - Series improving BFQ heuristics (Paolo, et al)

   - Series improving heuristics for the Kyber IO scheduler (Omar)

   - Removal of dangerous bio_rewind_iter() API (Ming)

   - Apply single queue IPI redirection logic to blk-mq (Ming)

   - Set of fixes and improvements for bcache (Coly et al)

   - Series closing a hotplug race with sysfs group attributes (Hannes)

   - Set of patches for lightnvm:
       - pblk trace support (Hans)
       - SPDX license header update (Javier)
       - Tons of refactoring patches to cleanly abstract the 1.2 and 2.0
         specs behind a common core interface. (Javier, Matias)
       - Enable pblk to use a common interface to retrieve chunk metadata
         (Matias)
       - Bug fixes (Various)

   - Set of fixes and updates to the blk IO latency target (Josef)

   - blk-mq queue number updates fixes (Jianchao)

   - Convert a bunch of drivers from the old legacy IO interface to
     blk-mq. This will conclude with the removal of the legacy IO
     interface itself in 4.21, with the rest of the drivers (me, Omar)

   - Removal of the DAC960 driver. The SCSI tree will introduce two
     replacement drivers for this (Hannes)"

* tag 'for-4.20/block-20181021' of git://git.kernel.dk/linux-block: (204 commits)
  block: setup bounce bio_sets properly
  blkcg: reassociate bios when make_request() is called recursively
  blkcg: fix edge case for blk_get_rl() under memory pressure
  nvme-fabrics: move controller options matching to fabrics
  nvme-rdma: always have a valid trsvcid
  mtip32xx: fully switch to the generic DMA API
  rsxx: switch to the generic DMA API
  umem: switch to the generic DMA API
  sx8: switch to the generic DMA API
  sx8: remove dead IF_64BIT_DMA_IS_POSSIBLE code
  skd: switch to the generic DMA API
  ubd: remove use of blk_rq_map_sg
  nvme-pci: remove duplicate check
  drivers/block: Remove DAC960 driver
  nvme-pci: fix hot removal during error handling
  nvmet-fcloop: suppress a compiler warning
  nvme-core: make implicit seed truncation explicit
  nvmet-fc: fix kernel-doc headers
  nvme-fc: rework the request initialization code
  nvme-fc: introduce struct nvme_fcp_op_w_sgl
  ...
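One item above, runtime PM for blk-mq, shows up near the bottom of this diff as the replacement of the QUEUE_FLAG_PREEMPT_ONLY flag with a pm_only counter in blk-core.c. As a rough illustration of those new semantics only (the struct, the toy_* names, and the stdatomic usage below are stand-ins, not the kernel code), a user-space sketch might look like this:

#include <stdatomic.h>
#include <stdio.h>

/*
 * Toy model of the pm_only behaviour: a nesting counter instead of a
 * single flag, so overlapping "set" calls must be balanced by "clear"
 * calls, and waiters are only woken when the count returns to zero.
 */
struct toy_queue {
	atomic_int pm_only;
};

static void toy_set_pm_only(struct toy_queue *q)
{
	atomic_fetch_add(&q->pm_only, 1);
}

static void toy_clear_pm_only(struct toy_queue *q)
{
	int pm_only = atomic_fetch_sub(&q->pm_only, 1) - 1;

	if (pm_only < 0)
		fprintf(stderr, "pm_only underflow\n");	/* the kernel path uses WARN_ON_ONCE() */
	if (pm_only == 0)
		puts("wake mq_freeze_wq waiters");	/* the kernel path uses wake_up_all() */
}

int main(void)
{
	struct toy_queue q;

	atomic_init(&q.pm_only, 0);
	toy_set_pm_only(&q);
	toy_set_pm_only(&q);	/* nested: counter is now 2, queue stays pm-only */
	toy_clear_pm_only(&q);	/* still 1: no wakeup yet */
	toy_clear_pm_only(&q);	/* drops to 0: waiters would be woken */
	return 0;
}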
@@ -74,7 +74,6 @@ config BLK_DEV_BSG

config BLK_DEV_BSGLIB
bool "Block layer SG support v4 helper lib"
default n
select BLK_DEV_BSG
select BLK_SCSI_REQUEST
help
@@ -107,7 +106,6 @@ config BLK_DEV_ZONED
config BLK_DEV_THROTTLING
bool "Block layer bio throttling support"
depends on BLK_CGROUP=y
default n
---help---
Block layer bio throttling support. It can be used to limit
the IO rate to a device. IO rate policies are per cgroup and
@@ -119,7 +117,6 @@ config BLK_DEV_THROTTLING
config BLK_DEV_THROTTLING_LOW
bool "Block throttling .low limit interface support (EXPERIMENTAL)"
depends on BLK_DEV_THROTTLING
default n
---help---
Add .low limit interface for block throttling. The low limit is a best
effort limit to prioritize cgroups. Depending on the setting, the limit
@@ -130,7 +127,6 @@ config BLK_DEV_THROTTLING_LOW

config BLK_CMDLINE_PARSER
bool "Block device command line partition parser"
default n
---help---
Enabling this option allows you to specify the partition layout from
the kernel boot args. This is typically of use for embedded devices
@@ -141,7 +137,6 @@ config BLK_CMDLINE_PARSER

config BLK_WBT
bool "Enable support for block device writeback throttling"
default n
---help---
Enabling this option enables the block layer to throttle buffered
background writeback from the VM, making it more smooth and having
@@ -152,7 +147,6 @@ config BLK_WBT
config BLK_CGROUP_IOLATENCY
bool "Enable support for latency based cgroup IO protection"
depends on BLK_CGROUP=y
default n
---help---
Enabling this option enables the .latency interface for IO throttling.
The IO controller will attempt to maintain average IO latencies below
@@ -163,7 +157,6 @@ config BLK_CGROUP_IOLATENCY

config BLK_WBT_SQ
bool "Single queue writeback throttling"
default n
depends on BLK_WBT
---help---
Enable writeback throttling by default on legacy single queue devices
@@ -228,4 +221,7 @@ config BLK_MQ_RDMA
depends on BLOCK && INFINIBAND
default y

config BLK_PM
def_bool BLOCK && PM

source block/Kconfig.iosched
@@ -36,7 +36,6 @@ config IOSCHED_CFQ
config CFQ_GROUP_IOSCHED
bool "CFQ Group Scheduling support"
depends on IOSCHED_CFQ && BLK_CGROUP
default n
---help---
Enable group IO scheduling in CFQ.

@@ -82,7 +81,6 @@ config MQ_IOSCHED_KYBER

config IOSCHED_BFQ
tristate "BFQ I/O scheduler"
default n
---help---
BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of
of the device among all processes according to their weights,
@@ -94,7 +92,6 @@ config IOSCHED_BFQ
config BFQ_GROUP_IOSCHED
bool "BFQ hierarchical scheduling support"
depends on IOSCHED_BFQ && BLK_CGROUP
default n
---help---

Enable hierarchical scheduling in BFQ, using the blkio

@@ -37,3 +37,4 @@ obj-$(CONFIG_BLK_WBT) += blk-wbt.o
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
@@ -642,7 +642,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
uint64_t serial_nr;

rcu_read_lock();
serial_nr = bio_blkcg(bio)->css.serial_nr;
serial_nr = __bio_blkcg(bio)->css.serial_nr;

/*
* Check whether blkcg has changed. The condition may trigger
@@ -651,7 +651,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
goto out;

bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio));
/*
* Update blkg_path for bfq_log_* functions. We cache this
* path, and update it here, for the following
@@ -624,12 +624,13 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
|
||||
}
|
||||
|
||||
/*
|
||||
* Tell whether there are active queues or groups with differentiated weights.
|
||||
* Tell whether there are active queues with different weights or
|
||||
* active groups.
|
||||
*/
|
||||
static bool bfq_differentiated_weights(struct bfq_data *bfqd)
|
||||
static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
|
||||
{
|
||||
/*
|
||||
* For weights to differ, at least one of the trees must contain
|
||||
* For queue weights to differ, queue_weights_tree must contain
|
||||
* at least two nodes.
|
||||
*/
|
||||
return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
|
||||
@@ -637,9 +638,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd)
|
||||
bfqd->queue_weights_tree.rb_node->rb_right)
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
) ||
|
||||
(!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
|
||||
(bfqd->group_weights_tree.rb_node->rb_left ||
|
||||
bfqd->group_weights_tree.rb_node->rb_right)
|
||||
(bfqd->num_active_groups > 0
|
||||
#endif
|
||||
);
|
||||
}
|
||||
@@ -657,26 +656,25 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd)
|
||||
* 3) all active groups at the same level in the groups tree have the same
|
||||
* number of children.
|
||||
*
|
||||
* Unfortunately, keeping the necessary state for evaluating exactly the
|
||||
* above symmetry conditions would be quite complex and time-consuming.
|
||||
* Therefore this function evaluates, instead, the following stronger
|
||||
* sub-conditions, for which it is much easier to maintain the needed
|
||||
* state:
|
||||
* Unfortunately, keeping the necessary state for evaluating exactly
|
||||
* the last two symmetry sub-conditions above would be quite complex
|
||||
* and time consuming. Therefore this function evaluates, instead,
|
||||
* only the following stronger two sub-conditions, for which it is
|
||||
* much easier to maintain the needed state:
|
||||
* 1) all active queues have the same weight,
|
||||
* 2) all active groups have the same weight,
|
||||
* 3) all active groups have at most one active child each.
|
||||
* In particular, the last two conditions are always true if hierarchical
|
||||
* support and the cgroups interface are not enabled, thus no state needs
|
||||
* to be maintained in this case.
|
||||
* 2) there are no active groups.
|
||||
* In particular, the last condition is always true if hierarchical
|
||||
* support or the cgroups interface are not enabled, thus no state
|
||||
* needs to be maintained in this case.
|
||||
*/
|
||||
static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
|
||||
{
|
||||
return !bfq_differentiated_weights(bfqd);
|
||||
return !bfq_varied_queue_weights_or_active_groups(bfqd);
|
||||
}
|
||||
|
||||
/*
|
||||
* If the weight-counter tree passed as input contains no counter for
|
||||
* the weight of the input entity, then add that counter; otherwise just
|
||||
* the weight of the input queue, then add that counter; otherwise just
|
||||
* increment the existing counter.
|
||||
*
|
||||
* Note that weight-counter trees contain few nodes in mostly symmetric
|
||||
@@ -687,25 +685,25 @@ static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
|
||||
* In most scenarios, the rate at which nodes are created/destroyed
|
||||
* should be low too.
|
||||
*/
|
||||
void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
|
||||
void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
struct rb_root *root)
|
||||
{
|
||||
struct bfq_entity *entity = &bfqq->entity;
|
||||
struct rb_node **new = &(root->rb_node), *parent = NULL;
|
||||
|
||||
/*
|
||||
* Do not insert if the entity is already associated with a
|
||||
* Do not insert if the queue is already associated with a
|
||||
* counter, which happens if:
|
||||
* 1) the entity is associated with a queue,
|
||||
* 2) a request arrival has caused the queue to become both
|
||||
* 1) a request arrival has caused the queue to become both
|
||||
* non-weight-raised, and hence change its weight, and
|
||||
* backlogged; in this respect, each of the two events
|
||||
* causes an invocation of this function,
|
||||
* 3) this is the invocation of this function caused by the
|
||||
* 2) this is the invocation of this function caused by the
|
||||
* second event. This second invocation is actually useless,
|
||||
* and we handle this fact by exiting immediately. More
|
||||
* efficient or clearer solutions might possibly be adopted.
|
||||
*/
|
||||
if (entity->weight_counter)
|
||||
if (bfqq->weight_counter)
|
||||
return;
|
||||
|
||||
while (*new) {
|
||||
@@ -715,7 +713,7 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
|
||||
parent = *new;
|
||||
|
||||
if (entity->weight == __counter->weight) {
|
||||
entity->weight_counter = __counter;
|
||||
bfqq->weight_counter = __counter;
|
||||
goto inc_counter;
|
||||
}
|
||||
if (entity->weight < __counter->weight)
|
||||
@@ -724,66 +722,67 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
|
||||
new = &((*new)->rb_right);
|
||||
}
|
||||
|
||||
entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
|
||||
GFP_ATOMIC);
|
||||
bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
|
||||
GFP_ATOMIC);
|
||||
|
||||
/*
|
||||
* In the unlucky event of an allocation failure, we just
|
||||
* exit. This will cause the weight of entity to not be
|
||||
* considered in bfq_differentiated_weights, which, in its
|
||||
* turn, causes the scenario to be deemed wrongly symmetric in
|
||||
* case entity's weight would have been the only weight making
|
||||
* the scenario asymmetric. On the bright side, no unbalance
|
||||
* will however occur when entity becomes inactive again (the
|
||||
* invocation of this function is triggered by an activation
|
||||
* of entity). In fact, bfq_weights_tree_remove does nothing
|
||||
* if !entity->weight_counter.
|
||||
* exit. This will cause the weight of queue to not be
|
||||
* considered in bfq_varied_queue_weights_or_active_groups,
|
||||
* which, in its turn, causes the scenario to be deemed
|
||||
* wrongly symmetric in case bfqq's weight would have been
|
||||
* the only weight making the scenario asymmetric. On the
|
||||
* bright side, no unbalance will however occur when bfqq
|
||||
* becomes inactive again (the invocation of this function
|
||||
* is triggered by an activation of queue). In fact,
|
||||
* bfq_weights_tree_remove does nothing if
|
||||
* !bfqq->weight_counter.
|
||||
*/
|
||||
if (unlikely(!entity->weight_counter))
|
||||
if (unlikely(!bfqq->weight_counter))
|
||||
return;
|
||||
|
||||
entity->weight_counter->weight = entity->weight;
|
||||
rb_link_node(&entity->weight_counter->weights_node, parent, new);
|
||||
rb_insert_color(&entity->weight_counter->weights_node, root);
|
||||
bfqq->weight_counter->weight = entity->weight;
|
||||
rb_link_node(&bfqq->weight_counter->weights_node, parent, new);
|
||||
rb_insert_color(&bfqq->weight_counter->weights_node, root);
|
||||
|
||||
inc_counter:
|
||||
entity->weight_counter->num_active++;
|
||||
bfqq->weight_counter->num_active++;
|
||||
}
|
||||
|
||||
/*
|
||||
* Decrement the weight counter associated with the entity, and, if the
|
||||
* Decrement the weight counter associated with the queue, and, if the
|
||||
* counter reaches 0, remove the counter from the tree.
|
||||
* See the comments to the function bfq_weights_tree_add() for considerations
|
||||
* about overhead.
|
||||
*/
|
||||
void __bfq_weights_tree_remove(struct bfq_data *bfqd,
|
||||
struct bfq_entity *entity,
|
||||
struct bfq_queue *bfqq,
|
||||
struct rb_root *root)
|
||||
{
|
||||
if (!entity->weight_counter)
|
||||
if (!bfqq->weight_counter)
|
||||
return;
|
||||
|
||||
entity->weight_counter->num_active--;
|
||||
if (entity->weight_counter->num_active > 0)
|
||||
bfqq->weight_counter->num_active--;
|
||||
if (bfqq->weight_counter->num_active > 0)
|
||||
goto reset_entity_pointer;
|
||||
|
||||
rb_erase(&entity->weight_counter->weights_node, root);
|
||||
kfree(entity->weight_counter);
|
||||
rb_erase(&bfqq->weight_counter->weights_node, root);
|
||||
kfree(bfqq->weight_counter);
|
||||
|
||||
reset_entity_pointer:
|
||||
entity->weight_counter = NULL;
|
||||
bfqq->weight_counter = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Invoke __bfq_weights_tree_remove on bfqq and all its inactive
|
||||
* parent entities.
|
||||
* Invoke __bfq_weights_tree_remove on bfqq and decrement the number
|
||||
* of active groups for each queue's inactive parent entity.
|
||||
*/
|
||||
void bfq_weights_tree_remove(struct bfq_data *bfqd,
|
||||
struct bfq_queue *bfqq)
|
||||
{
|
||||
struct bfq_entity *entity = bfqq->entity.parent;
|
||||
|
||||
__bfq_weights_tree_remove(bfqd, &bfqq->entity,
|
||||
__bfq_weights_tree_remove(bfqd, bfqq,
|
||||
&bfqd->queue_weights_tree);
|
||||
|
||||
for_each_entity(entity) {
|
||||
@@ -797,17 +796,13 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
|
||||
* next_in_service for details on why
|
||||
* in_service_entity must be checked too).
|
||||
*
|
||||
* As a consequence, the weight of entity is
|
||||
* not to be removed. In addition, if entity
|
||||
* is active, then its parent entities are
|
||||
* active as well, and thus their weights are
|
||||
* not to be removed either. In the end, this
|
||||
* loop must stop here.
|
||||
* As a consequence, its parent entities are
|
||||
* active as well, and thus this loop must
|
||||
* stop here.
|
||||
*/
|
||||
break;
|
||||
}
|
||||
__bfq_weights_tree_remove(bfqd, entity,
|
||||
&bfqd->group_weights_tree);
|
||||
bfqd->num_active_groups--;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3182,6 +3177,13 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
|
||||
jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
|
||||
}
|
||||
|
||||
static bool bfq_bfqq_injectable(struct bfq_queue *bfqq)
|
||||
{
|
||||
return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
|
||||
blk_queue_nonrot(bfqq->bfqd->queue) &&
|
||||
bfqq->bfqd->hw_tag;
|
||||
}
|
||||
|
||||
/**
|
||||
* bfq_bfqq_expire - expire a queue.
|
||||
* @bfqd: device owning the queue.
|
||||
@@ -3291,6 +3293,8 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
|
||||
if (ref == 1) /* bfqq is gone, no more actions on it */
|
||||
return;
|
||||
|
||||
bfqq->injected_service = 0;
|
||||
|
||||
/* mark bfqq as waiting a request only if a bic still points to it */
|
||||
if (!bfq_bfqq_busy(bfqq) &&
|
||||
reason != BFQQE_BUDGET_TIMEOUT &&
|
||||
@@ -3497,9 +3501,11 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
|
||||
* symmetric scenario where:
|
||||
* (i) each of these processes must get the same throughput as
|
||||
* the others;
|
||||
* (ii) all these processes have the same I/O pattern
|
||||
(either sequential or random).
|
||||
* In fact, in such a scenario, the drive will tend to treat
|
||||
* (ii) the I/O of each process has the same properties, in
|
||||
* terms of locality (sequential or random), direction
|
||||
* (reads or writes), request sizes, greediness
|
||||
* (from I/O-bound to sporadic), and so on.
|
||||
* In fact, in such a scenario, the drive tends to treat
|
||||
* the requests of each of these processes in about the same
|
||||
* way as the requests of the others, and thus to provide
|
||||
* each of these processes with about the same throughput
|
||||
@@ -3508,18 +3514,50 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
|
||||
* certainly needed to guarantee that bfqq receives its
|
||||
* assigned fraction of the device throughput (see [1] for
|
||||
* details).
|
||||
* The problem is that idling may significantly reduce
|
||||
* throughput with certain combinations of types of I/O and
|
||||
* devices. An important example is sync random I/O, on flash
|
||||
* storage with command queueing. So, unless bfqq falls in the
|
||||
* above cases where idling also boosts throughput, it would
|
||||
* be important to check conditions (i) and (ii) accurately,
|
||||
* so as to avoid idling when not strictly needed for service
|
||||
* guarantees.
|
||||
*
|
||||
* We address this issue by controlling, actually, only the
|
||||
* symmetry sub-condition (i), i.e., provided that
|
||||
* sub-condition (i) holds, idling is not performed,
|
||||
* regardless of whether sub-condition (ii) holds. In other
|
||||
* words, only if sub-condition (i) holds, then idling is
|
||||
* Unfortunately, it is extremely difficult to thoroughly
|
||||
* check condition (ii). And, in case there are active groups,
|
||||
* it becomes very difficult to check condition (i) too. In
|
||||
* fact, if there are active groups, then, for condition (i)
|
||||
* to become false, it is enough that an active group contains
|
||||
* more active processes or sub-groups than some other active
|
||||
* group. We address this issue with the following bi-modal
|
||||
* behavior, implemented in the function
|
||||
* bfq_symmetric_scenario().
|
||||
*
|
||||
* If there are active groups, then the scenario is tagged as
|
||||
* asymmetric, conservatively, without checking any of the
|
||||
* conditions (i) and (ii). So the device is idled for bfqq.
|
||||
* This behavior matches also the fact that groups are created
|
||||
* exactly if controlling I/O (to preserve bandwidth and
|
||||
* latency guarantees) is a primary concern.
|
||||
*
|
||||
* On the opposite end, if there are no active groups, then
|
||||
* only condition (i) is actually controlled, i.e., provided
|
||||
* that condition (i) holds, idling is not performed,
|
||||
* regardless of whether condition (ii) holds. In other words,
|
||||
* only if condition (i) does not hold, then idling is
|
||||
* allowed, and the device tends to be prevented from queueing
|
||||
* many requests, possibly of several processes. The reason
|
||||
* for not controlling also sub-condition (ii) is that we
|
||||
* exploit preemption to preserve guarantees in case of
|
||||
* symmetric scenarios, even if (ii) does not hold, as
|
||||
* explained in the next two paragraphs.
|
||||
* many requests, possibly of several processes. Since there
|
||||
* are no active groups, then, to control condition (i) it is
|
||||
* enough to check whether all active queues have the same
|
||||
* weight.
|
||||
*
|
||||
* Not checking condition (ii) evidently exposes bfqq to the
|
||||
* risk of getting less throughput than its fair share.
|
||||
* However, for queues with the same weight, a further
|
||||
* mechanism, preemption, mitigates or even eliminates this
|
||||
* problem. And it does so without consequences on overall
|
||||
* throughput. This mechanism and its benefits are explained
|
||||
* in the next three paragraphs.
|
||||
*
|
||||
* Even if a queue, say Q, is expired when it remains idle, Q
|
||||
* can still preempt the new in-service queue if the next
|
||||
@@ -3533,11 +3571,7 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
|
||||
* idling allows the internal queues of the device to contain
|
||||
* many requests, and thus to reorder requests, we can rather
|
||||
* safely assume that the internal scheduler still preserves a
|
||||
* minimum of mid-term fairness. The motivation for using
|
||||
* preemption instead of idling is that, by not idling,
|
||||
* service guarantees are preserved without minimally
|
||||
* sacrificing throughput. In other words, both a high
|
||||
* throughput and its desired distribution are obtained.
|
||||
* minimum of mid-term fairness.
|
||||
*
|
||||
* More precisely, this preemption-based, idleless approach
|
||||
* provides fairness in terms of IOPS, and not sectors per
|
||||
@@ -3556,22 +3590,27 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
|
||||
* 1024/8 times as high as the service received by the other
|
||||
* queue.
|
||||
*
|
||||
* On the other hand, device idling is performed, and thus
|
||||
* pure sector-domain guarantees are provided, for the
|
||||
* following queues, which are likely to need stronger
|
||||
* throughput guarantees: weight-raised queues, and queues
|
||||
* with a higher weight than other queues. When such queues
|
||||
* are active, sub-condition (i) is false, which triggers
|
||||
* device idling.
|
||||
* The motivation for using preemption instead of idling (for
|
||||
* queues with the same weight) is that, by not idling,
|
||||
* service guarantees are preserved (completely or at least in
|
||||
* part) without minimally sacrificing throughput. And, if
|
||||
* there is no active group, then the primary expectation for
|
||||
* this device is probably a high throughput.
|
||||
*
|
||||
* According to the above considerations, the next variable is
|
||||
* true (only) if sub-condition (i) holds. To compute the
|
||||
* value of this variable, we not only use the return value of
|
||||
* the function bfq_symmetric_scenario(), but also check
|
||||
* whether bfqq is being weight-raised, because
|
||||
* bfq_symmetric_scenario() does not take into account also
|
||||
* weight-raised queues (see comments on
|
||||
* bfq_weights_tree_add()).
|
||||
* We are now left only with explaining the additional
|
||||
* compound condition that is checked below for deciding
|
||||
* whether the scenario is asymmetric. To explain this
|
||||
* compound condition, we need to add that the function
|
||||
* bfq_symmetric_scenario checks the weights of only
|
||||
* non-weight-raised queues, for efficiency reasons (see
|
||||
* comments on bfq_weights_tree_add()). Then the fact that
|
||||
* bfqq is weight-raised is checked explicitly here. More
|
||||
* precisely, the compound condition below takes into account
|
||||
* also the fact that, even if bfqq is being weight-raised,
|
||||
* the scenario is still symmetric if all active queues happen
|
||||
* to be weight-raised. Actually, we should be even more
|
||||
* precise here, and differentiate between interactive weight
|
||||
* raising and soft real-time weight raising.
|
||||
*
|
||||
* As a side note, it is worth considering that the above
|
||||
* device-idling countermeasures may however fail in the
|
||||
@@ -3583,7 +3622,8 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
|
||||
* to let requests be served in the desired order until all
|
||||
* the requests already queued in the device have been served.
|
||||
*/
|
||||
asymmetric_scenario = bfqq->wr_coeff > 1 ||
|
||||
asymmetric_scenario = (bfqq->wr_coeff > 1 &&
|
||||
bfqd->wr_busy_queues < bfqd->busy_queues) ||
|
||||
!bfq_symmetric_scenario(bfqd);
|
||||
|
||||
/*
|
||||
@@ -3629,6 +3669,30 @@ static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
|
||||
return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq);
|
||||
}
|
||||
|
||||
static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
|
||||
{
|
||||
struct bfq_queue *bfqq;
|
||||
|
||||
/*
|
||||
* A linear search; but, with a high probability, very few
|
||||
* steps are needed to find a candidate queue, i.e., a queue
|
||||
* with enough budget left for its next request. In fact:
|
||||
* - BFQ dynamically updates the budget of every queue so as
|
||||
* to accommodate the expected backlog of the queue;
|
||||
* - if a queue gets all its requests dispatched as injected
|
||||
* service, then the queue is removed from the active list
|
||||
* (and re-added only if it gets new requests, but with
|
||||
* enough budget for its new backlog).
|
||||
*/
|
||||
list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
|
||||
if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
|
||||
bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
|
||||
bfq_bfqq_budget_left(bfqq))
|
||||
return bfqq;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Select a queue for service. If we have a current queue in service,
|
||||
* check whether to continue servicing it, or retrieve and set a new one.
|
||||
@@ -3710,10 +3774,19 @@ check_queue:
|
||||
* No requests pending. However, if the in-service queue is idling
|
||||
* for a new request, or has requests waiting for a completion and
|
||||
* may idle after their completion, then keep it anyway.
|
||||
*
|
||||
* Yet, to boost throughput, inject service from other queues if
|
||||
* possible.
|
||||
*/
|
||||
if (bfq_bfqq_wait_request(bfqq) ||
|
||||
(bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
|
||||
bfqq = NULL;
|
||||
if (bfq_bfqq_injectable(bfqq) &&
|
||||
bfqq->injected_service * bfqq->inject_coeff <
|
||||
bfqq->entity.service * 10)
|
||||
bfqq = bfq_choose_bfqq_for_injection(bfqd);
|
||||
else
|
||||
bfqq = NULL;
|
||||
|
||||
goto keep_queue;
|
||||
}
|
||||
|
||||
@@ -3803,6 +3876,14 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
|
||||
|
||||
bfq_dispatch_remove(bfqd->queue, rq);
|
||||
|
||||
if (bfqq != bfqd->in_service_queue) {
|
||||
if (likely(bfqd->in_service_queue))
|
||||
bfqd->in_service_queue->injected_service +=
|
||||
bfq_serv_to_charge(rq, bfqq);
|
||||
|
||||
goto return_rq;
|
||||
}
|
||||
|
||||
/*
|
||||
* If weight raising has to terminate for bfqq, then next
|
||||
* function causes an immediate update of bfqq's weight,
|
||||
@@ -3821,13 +3902,12 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
|
||||
* belongs to CLASS_IDLE and other queues are waiting for
|
||||
* service.
|
||||
*/
|
||||
if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
|
||||
goto expire;
|
||||
if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq)))
|
||||
goto return_rq;
|
||||
|
||||
return rq;
|
||||
|
||||
expire:
|
||||
bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
|
||||
|
||||
return_rq:
|
||||
return rq;
|
||||
}
|
||||
|
||||
@@ -4232,6 +4312,13 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
bfq_mark_bfqq_has_short_ttime(bfqq);
|
||||
bfq_mark_bfqq_sync(bfqq);
|
||||
bfq_mark_bfqq_just_created(bfqq);
|
||||
/*
|
||||
* Aggressively inject a lot of service: up to 90%.
|
||||
* This coefficient remains constant during bfqq life,
|
||||
* but this behavior might be changed, after enough
|
||||
* testing and tuning.
|
||||
*/
|
||||
bfqq->inject_coeff = 1;
|
||||
} else
|
||||
bfq_clear_bfqq_sync(bfqq);
|
||||
|
||||
@@ -4297,7 +4384,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
|
||||
bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio));
|
||||
if (!bfqg) {
|
||||
bfqq = &bfqd->oom_bfqq;
|
||||
goto out;
|
||||
@@ -5330,7 +5417,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
|
||||
bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
|
||||
|
||||
bfqd->queue_weights_tree = RB_ROOT;
|
||||
bfqd->group_weights_tree = RB_ROOT;
|
||||
bfqd->num_active_groups = 0;
|
||||
|
||||
INIT_LIST_HEAD(&bfqd->active_list);
|
||||
INIT_LIST_HEAD(&bfqd->idle_list);
|
||||
|
@@ -108,15 +108,14 @@ struct bfq_sched_data {
|
||||
};
|
||||
|
||||
/**
|
||||
* struct bfq_weight_counter - counter of the number of all active entities
|
||||
* struct bfq_weight_counter - counter of the number of all active queues
|
||||
* with a given weight.
|
||||
*/
|
||||
struct bfq_weight_counter {
|
||||
unsigned int weight; /* weight of the entities this counter refers to */
|
||||
unsigned int num_active; /* nr of active entities with this weight */
|
||||
unsigned int weight; /* weight of the queues this counter refers to */
|
||||
unsigned int num_active; /* nr of active queues with this weight */
|
||||
/*
|
||||
* Weights tree member (see bfq_data's @queue_weights_tree and
|
||||
* @group_weights_tree)
|
||||
* Weights tree member (see bfq_data's @queue_weights_tree)
|
||||
*/
|
||||
struct rb_node weights_node;
|
||||
};
|
||||
@@ -151,8 +150,6 @@ struct bfq_weight_counter {
|
||||
struct bfq_entity {
|
||||
/* service_tree member */
|
||||
struct rb_node rb_node;
|
||||
/* pointer to the weight counter associated with this entity */
|
||||
struct bfq_weight_counter *weight_counter;
|
||||
|
||||
/*
|
||||
* Flag, true if the entity is on a tree (either the active or
|
||||
@@ -266,6 +263,9 @@ struct bfq_queue {
|
||||
/* entity representing this queue in the scheduler */
|
||||
struct bfq_entity entity;
|
||||
|
||||
/* pointer to the weight counter associated with this entity */
|
||||
struct bfq_weight_counter *weight_counter;
|
||||
|
||||
/* maximum budget allowed from the feedback mechanism */
|
||||
int max_budget;
|
||||
/* budget expiration (in jiffies) */
|
||||
@@ -351,6 +351,32 @@ struct bfq_queue {
|
||||
unsigned long split_time; /* time of last split */
|
||||
|
||||
unsigned long first_IO_time; /* time of first I/O for this queue */
|
||||
|
||||
/* max service rate measured so far */
|
||||
u32 max_service_rate;
|
||||
/*
|
||||
* Ratio between the service received by bfqq while it is in
|
||||
* service, and the cumulative service (of requests of other
|
||||
* queues) that may be injected while bfqq is empty but still
|
||||
* in service. To increase precision, the coefficient is
|
||||
* measured in tenths of unit. Here are some example of (1)
|
||||
* ratios, (2) resulting percentages of service injected
|
||||
* w.r.t. to the total service dispatched while bfqq is in
|
||||
* service, and (3) corresponding values of the coefficient:
|
||||
* 1 (50%) -> 10
|
||||
* 2 (33%) -> 20
|
||||
* 10 (9%) -> 100
|
||||
* 9.9 (9%) -> 99
|
||||
* 1.5 (40%) -> 15
|
||||
* 0.5 (66%) -> 5
|
||||
* 0.1 (90%) -> 1
|
||||
*
|
||||
* So, if the coefficient is lower than 10, then
|
||||
* injected service is more than bfqq service.
|
||||
*/
|
||||
unsigned int inject_coeff;
|
||||
/* amount of service injected in current service slot */
|
||||
unsigned int injected_service;
|
||||
};
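The inject_coeff table above pairs each ratio with a coefficient in tenths; combined with the dispatch-side check shown earlier in this diff (bfqq->injected_service * bfqq->inject_coeff < bfqq->entity.service * 10), the allowed share of injected service falls out directly. A small stand-alone check of that arithmetic (illustrative only, not kernel code):

/* Quick sanity check of the inject_coeff table above. */
#include <stdio.h>

int main(void)
{
	unsigned int coeffs[] = { 10, 20, 100, 1 };	/* a few rows of the table */

	for (unsigned int i = 0; i < 4; i++) {
		unsigned int coeff = coeffs[i];
		/*
		 * Largest injected:own-service ratio still allowed by
		 * "injected * coeff < service * 10" is just under 10/coeff.
		 */
		double max_ratio = 10.0 / coeff;
		double injected_share = max_ratio / (1.0 + max_ratio);

		printf("coeff %3u -> up to about %.0f%% injected service\n",
		       coeff, injected_share * 100);
	}
	return 0;
}

With coeff = 1 this gives roughly 90% injected service, matching the "Aggressively inject a lot of service: up to 90%" comment where inject_coeff is initialized.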
|
||||
|
||||
/**
|
||||
@@ -423,14 +449,9 @@ struct bfq_data {
|
||||
*/
|
||||
struct rb_root queue_weights_tree;
|
||||
/*
|
||||
* rbtree of non-queue @bfq_entity weight counters, sorted by
|
||||
* weight. Used to keep track of whether all @bfq_groups have
|
||||
* the same weight. The tree contains one counter for each
|
||||
* distinct weight associated to some active @bfq_group (see
|
||||
* the comments to the functions bfq_weights_tree_[add|remove]
|
||||
* for further details).
|
||||
* number of groups with requests still waiting for completion
|
||||
*/
|
||||
struct rb_root group_weights_tree;
|
||||
unsigned int num_active_groups;
|
||||
|
||||
/*
|
||||
* Number of bfq_queues containing requests (including the
|
||||
@@ -825,10 +846,10 @@ struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync);
|
||||
void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
|
||||
struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
|
||||
void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
|
||||
void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
|
||||
void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
struct rb_root *root);
|
||||
void __bfq_weights_tree_remove(struct bfq_data *bfqd,
|
||||
struct bfq_entity *entity,
|
||||
struct bfq_queue *bfqq,
|
||||
struct rb_root *root);
|
||||
void bfq_weights_tree_remove(struct bfq_data *bfqd,
|
||||
struct bfq_queue *bfqq);
|
||||
|
@@ -788,25 +788,29 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
|
||||
new_weight = entity->orig_weight *
|
||||
(bfqq ? bfqq->wr_coeff : 1);
|
||||
/*
|
||||
* If the weight of the entity changes, remove the entity
|
||||
* from its old weight counter (if there is a counter
|
||||
* associated with the entity), and add it to the counter
|
||||
* associated with its new weight.
|
||||
* If the weight of the entity changes, and the entity is a
|
||||
* queue, remove the entity from its old weight counter (if
|
||||
* there is a counter associated with the entity).
|
||||
*/
|
||||
if (prev_weight != new_weight) {
|
||||
root = bfqq ? &bfqd->queue_weights_tree :
|
||||
&bfqd->group_weights_tree;
|
||||
__bfq_weights_tree_remove(bfqd, entity, root);
|
||||
if (bfqq) {
|
||||
root = &bfqd->queue_weights_tree;
|
||||
__bfq_weights_tree_remove(bfqd, bfqq, root);
|
||||
} else
|
||||
bfqd->num_active_groups--;
|
||||
}
|
||||
entity->weight = new_weight;
|
||||
/*
|
||||
* Add the entity to its weights tree only if it is
|
||||
* not associated with a weight-raised queue.
|
||||
* Add the entity, if it is not a weight-raised queue,
|
||||
* to the counter associated with its new weight.
|
||||
*/
|
||||
if (prev_weight != new_weight &&
|
||||
(bfqq ? bfqq->wr_coeff == 1 : 1))
|
||||
/* If we get here, root has been initialized. */
|
||||
bfq_weights_tree_add(bfqd, entity, root);
|
||||
if (prev_weight != new_weight) {
|
||||
if (bfqq && bfqq->wr_coeff == 1) {
|
||||
/* If we get here, root has been initialized. */
|
||||
bfq_weights_tree_add(bfqd, bfqq, root);
|
||||
} else
|
||||
bfqd->num_active_groups++;
|
||||
}
|
||||
|
||||
new_st->wsum += entity->weight;
|
||||
|
||||
@@ -1012,9 +1016,9 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
|
||||
if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */
|
||||
struct bfq_group *bfqg =
|
||||
container_of(entity, struct bfq_group, entity);
|
||||
struct bfq_data *bfqd = bfqg->bfqd;
|
||||
|
||||
bfq_weights_tree_add(bfqg->bfqd, entity,
|
||||
&bfqd->group_weights_tree);
|
||||
bfqd->num_active_groups++;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1181,10 +1185,17 @@ bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree)
|
||||
st = bfq_entity_service_tree(entity);
|
||||
is_in_service = entity == sd->in_service_entity;
|
||||
|
||||
if (is_in_service) {
|
||||
bfq_calc_finish(entity, entity->service);
|
||||
bfq_calc_finish(entity, entity->service);
|
||||
|
||||
if (is_in_service)
|
||||
sd->in_service_entity = NULL;
|
||||
}
|
||||
else
|
||||
/*
|
||||
* Non in-service entity: nobody will take care of
|
||||
* resetting its service counter on expiration. Do it
|
||||
* now.
|
||||
*/
|
||||
entity->service = 0;
|
||||
|
||||
if (entity->tree == &st->active)
|
||||
bfq_active_extract(st, entity);
|
||||
@@ -1685,7 +1696,7 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
|
||||
|
||||
if (!bfqq->dispatched)
|
||||
if (bfqq->wr_coeff == 1)
|
||||
bfq_weights_tree_add(bfqd, &bfqq->entity,
|
||||
bfq_weights_tree_add(bfqd, bfqq,
|
||||
&bfqd->queue_weights_tree);
|
||||
|
||||
if (bfqq->wr_coeff > 1)
|
||||
|
@@ -306,6 +306,8 @@ bool bio_integrity_prep(struct bio *bio)
|
||||
if (bio_data_dir(bio) == WRITE) {
|
||||
bio_integrity_process(bio, &bio->bi_iter,
|
||||
bi->profile->generate_fn);
|
||||
} else {
|
||||
bip->bio_iter = bio->bi_iter;
|
||||
}
|
||||
return true;
|
||||
|
||||
@@ -331,20 +333,14 @@ static void bio_integrity_verify_fn(struct work_struct *work)
|
||||
container_of(work, struct bio_integrity_payload, bip_work);
|
||||
struct bio *bio = bip->bip_bio;
|
||||
struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
|
||||
struct bvec_iter iter = bio->bi_iter;
|
||||
|
||||
/*
|
||||
* At the moment verify is called bio's iterator was advanced
|
||||
* during split and completion, we need to rewind iterator to
|
||||
* it's original position.
|
||||
*/
|
||||
if (bio_rewind_iter(bio, &iter, iter.bi_done)) {
|
||||
bio->bi_status = bio_integrity_process(bio, &iter,
|
||||
bi->profile->verify_fn);
|
||||
} else {
|
||||
bio->bi_status = BLK_STS_IOERR;
|
||||
}
|
||||
|
||||
bio->bi_status = bio_integrity_process(bio, &bip->bio_iter,
|
||||
bi->profile->verify_fn);
|
||||
bio_integrity_free(bio);
|
||||
bio_endio(bio);
|
||||
}
|
||||
|
block/bio.c
@@ -609,7 +609,9 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
|
||||
bio->bi_iter = bio_src->bi_iter;
|
||||
bio->bi_io_vec = bio_src->bi_io_vec;
|
||||
|
||||
bio_clone_blkcg_association(bio, bio_src);
|
||||
bio_clone_blkg_association(bio, bio_src);
|
||||
|
||||
blkcg_bio_issue_init(bio);
|
||||
}
|
||||
EXPORT_SYMBOL(__bio_clone_fast);
|
||||
|
||||
@@ -729,7 +731,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
|
||||
}
|
||||
|
||||
/* If we may be able to merge these biovecs, force a recount */
|
||||
if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
|
||||
if (bio->bi_vcnt > 1 && biovec_phys_mergeable(q, bvec - 1, bvec))
|
||||
bio_clear_flag(bio, BIO_SEG_VALID);
|
||||
|
||||
done:
|
||||
@@ -827,6 +829,8 @@ int bio_add_page(struct bio *bio, struct page *page,
|
||||
}
|
||||
EXPORT_SYMBOL(bio_add_page);
|
||||
|
||||
#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
|
||||
|
||||
/**
|
||||
* __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
|
||||
* @bio: bio to add pages to
|
||||
@@ -839,38 +843,35 @@ EXPORT_SYMBOL(bio_add_page);
|
||||
*/
|
||||
static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
|
||||
{
|
||||
unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt, idx;
|
||||
unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
|
||||
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
|
||||
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
|
||||
struct page **pages = (struct page **)bv;
|
||||
ssize_t size, left;
|
||||
unsigned len, i;
|
||||
size_t offset;
|
||||
ssize_t size;
|
||||
|
||||
/*
|
||||
* Move page array up in the allocated memory for the bio vecs as far as
|
||||
* possible so that we can start filling biovecs from the beginning
|
||||
* without overwriting the temporary page array.
|
||||
*/
|
||||
BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
|
||||
pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
|
||||
|
||||
size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
|
||||
if (unlikely(size <= 0))
|
||||
return size ? size : -EFAULT;
|
||||
idx = nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;
|
||||
|
||||
/*
|
||||
* Deep magic below: We need to walk the pinned pages backwards
|
||||
* because we are abusing the space allocated for the bio_vecs
|
||||
* for the page array. Because the bio_vecs are larger than the
|
||||
* page pointers by definition this will always work. But it also
|
||||
* means we can't use bio_add_page, so any changes to it's semantics
|
||||
* need to be reflected here as well.
|
||||
*/
|
||||
bio->bi_iter.bi_size += size;
|
||||
bio->bi_vcnt += nr_pages;
|
||||
for (left = size, i = 0; left > 0; left -= len, i++) {
|
||||
struct page *page = pages[i];
|
||||
|
||||
while (idx--) {
|
||||
bv[idx].bv_page = pages[idx];
|
||||
bv[idx].bv_len = PAGE_SIZE;
|
||||
bv[idx].bv_offset = 0;
|
||||
len = min_t(size_t, PAGE_SIZE - offset, left);
|
||||
if (WARN_ON_ONCE(bio_add_page(bio, page, len, offset) != len))
|
||||
return -EINVAL;
|
||||
offset = 0;
|
||||
}
|
||||
|
||||
bv[0].bv_offset += offset;
|
||||
bv[0].bv_len -= offset;
|
||||
bv[nr_pages - 1].bv_len -= nr_pages * PAGE_SIZE - offset - size;
|
||||
|
||||
iov_iter_advance(iter, size);
|
||||
return 0;
|
||||
}
|
||||
@@ -1807,7 +1808,6 @@ struct bio *bio_split(struct bio *bio, int sectors,
|
||||
bio_integrity_trim(split);
|
||||
|
||||
bio_advance(bio, split->bi_iter.bi_size);
|
||||
bio->bi_iter.bi_done = 0;
|
||||
|
||||
if (bio_flagged(bio, BIO_TRACE_COMPLETION))
|
||||
bio_set_flag(split, BIO_TRACE_COMPLETION);
|
||||
@@ -1956,71 +1956,153 @@ EXPORT_SYMBOL(bioset_init_from_src);
|
||||
|
||||
#ifdef CONFIG_BLK_CGROUP
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
/**
|
||||
* bio_associate_blkcg_from_page - associate a bio with the page's blkcg
|
||||
* @bio: target bio
|
||||
* @page: the page to lookup the blkcg from
|
||||
*
|
||||
* Associate @bio with the blkcg from @page's owning memcg. This works like
|
||||
* every other associate function wrt references.
|
||||
*/
|
||||
int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
|
||||
{
|
||||
struct cgroup_subsys_state *blkcg_css;
|
||||
|
||||
if (unlikely(bio->bi_css))
|
||||
return -EBUSY;
|
||||
if (!page->mem_cgroup)
|
||||
return 0;
|
||||
blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
|
||||
&io_cgrp_subsys);
|
||||
bio->bi_css = blkcg_css;
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_MEMCG */
|
||||
|
||||
/**
|
||||
* bio_associate_blkcg - associate a bio with the specified blkcg
|
||||
* @bio: target bio
|
||||
* @blkcg_css: css of the blkcg to associate
|
||||
*
|
||||
* Associate @bio with the blkcg specified by @blkcg_css. Block layer will
|
||||
* treat @bio as if it were issued by a task which belongs to the blkcg.
|
||||
*
|
||||
* This function takes an extra reference of @blkcg_css which will be put
|
||||
* when @bio is released. The caller must own @bio and is responsible for
|
||||
* synchronizing calls to this function.
|
||||
*/
|
||||
int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
|
||||
{
|
||||
if (unlikely(bio->bi_css))
|
||||
return -EBUSY;
|
||||
css_get(blkcg_css);
|
||||
bio->bi_css = blkcg_css;
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(bio_associate_blkcg);
|
||||
|
||||
/**
|
||||
* bio_associate_blkg - associate a bio with the specified blkg
|
||||
* bio_associate_blkg - associate a bio with the a blkg
|
||||
* @bio: target bio
|
||||
* @blkg: the blkg to associate
|
||||
*
|
||||
* Associate @bio with the blkg specified by @blkg. This is the queue specific
|
||||
* blkcg information associated with the @bio, a reference will be taken on the
|
||||
* @blkg and will be freed when the bio is freed.
|
||||
* This tries to associate @bio with the specified blkg. Association failure
|
||||
* is handled by walking up the blkg tree. Therefore, the blkg associated can
|
||||
* be anything between @blkg and the root_blkg. This situation only happens
|
||||
* when a cgroup is dying and then the remaining bios will spill to the closest
|
||||
* alive blkg.
|
||||
*
|
||||
* A reference will be taken on the @blkg and will be released when @bio is
|
||||
* freed.
|
||||
*/
|
||||
int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
|
||||
{
|
||||
if (unlikely(bio->bi_blkg))
|
||||
return -EBUSY;
|
||||
if (!blkg_try_get(blkg))
|
||||
return -ENODEV;
|
||||
bio->bi_blkg = blkg;
|
||||
bio->bi_blkg = blkg_tryget_closest(blkg);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* __bio_associate_blkg_from_css - internal blkg association function
|
||||
*
|
||||
* This in the core association function that all association paths rely on.
|
||||
* A blkg reference is taken which is released upon freeing of the bio.
|
||||
*/
|
||||
static int __bio_associate_blkg_from_css(struct bio *bio,
|
||||
struct cgroup_subsys_state *css)
|
||||
{
|
||||
struct request_queue *q = bio->bi_disk->queue;
|
||||
struct blkcg_gq *blkg;
|
||||
int ret;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
if (!css || !css->parent)
|
||||
blkg = q->root_blkg;
|
||||
else
|
||||
blkg = blkg_lookup_create(css_to_blkcg(css), q);
|
||||
|
||||
ret = bio_associate_blkg(bio, blkg);
|
||||
|
||||
rcu_read_unlock();
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* bio_associate_blkg_from_css - associate a bio with a specified css
|
||||
* @bio: target bio
|
||||
* @css: target css
|
||||
*
|
||||
* Associate @bio with the blkg found by combining the css's blkg and the
|
||||
* request_queue of the @bio. This falls back to the queue's root_blkg if
|
||||
* the association fails with the css.
|
||||
*/
|
||||
int bio_associate_blkg_from_css(struct bio *bio,
|
||||
struct cgroup_subsys_state *css)
|
||||
{
|
||||
if (unlikely(bio->bi_blkg))
|
||||
return -EBUSY;
|
||||
return __bio_associate_blkg_from_css(bio, css);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
/**
|
||||
* bio_associate_blkg_from_page - associate a bio with the page's blkg
|
||||
* @bio: target bio
|
||||
* @page: the page to lookup the blkcg from
|
||||
*
|
||||
* Associate @bio with the blkg from @page's owning memcg and the respective
|
||||
* request_queue. If cgroup_e_css returns NULL, fall back to the queue's
|
||||
* root_blkg.
|
||||
*
|
||||
* Note: this must be called after bio has an associated device.
|
||||
*/
|
||||
int bio_associate_blkg_from_page(struct bio *bio, struct page *page)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
int ret;
|
||||
|
||||
if (unlikely(bio->bi_blkg))
|
||||
return -EBUSY;
|
||||
if (!page->mem_cgroup)
|
||||
return 0;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
|
||||
|
||||
ret = __bio_associate_blkg_from_css(bio, css);
|
||||
|
||||
rcu_read_unlock();
|
||||
return ret;
|
||||
}
|
||||
#endif /* CONFIG_MEMCG */
|
||||
|
||||
/**
|
||||
* bio_associate_create_blkg - associate a bio with a blkg from q
|
||||
* @q: request_queue where bio is going
|
||||
* @bio: target bio
|
||||
*
|
||||
* Associate @bio with the blkg found from the bio's css and the request_queue.
|
||||
* If one is not found, bio_lookup_blkg creates the blkg. This falls back to
|
||||
* the queue's root_blkg if association fails.
|
||||
*/
|
||||
int bio_associate_create_blkg(struct request_queue *q, struct bio *bio)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
int ret = 0;
|
||||
|
||||
/* someone has already associated this bio with a blkg */
|
||||
if (bio->bi_blkg)
|
||||
return ret;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
css = blkcg_css();
|
||||
|
||||
ret = __bio_associate_blkg_from_css(bio, css);
|
||||
|
||||
rcu_read_unlock();
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* bio_reassociate_blkg - reassociate a bio with a blkg from q
|
||||
* @q: request_queue where bio is going
|
||||
* @bio: target bio
|
||||
*
|
||||
* When submitting a bio, multiple recursive calls to make_request() may occur.
|
||||
* This causes the initial associate done in blkcg_bio_issue_check() to be
|
||||
* incorrect and reference the prior request_queue. This performs reassociation
|
||||
* when this situation happens.
|
||||
*/
|
||||
int bio_reassociate_blkg(struct request_queue *q, struct bio *bio)
|
||||
{
|
||||
if (bio->bi_blkg) {
|
||||
blkg_put(bio->bi_blkg);
|
||||
bio->bi_blkg = NULL;
|
||||
}
|
||||
|
||||
return bio_associate_create_blkg(q, bio);
|
||||
}
|
||||
|
||||
/**
|
||||
* bio_disassociate_task - undo bio_associate_current()
|
||||
* @bio: target bio
|
||||
@@ -2031,10 +2113,6 @@ void bio_disassociate_task(struct bio *bio)
|
||||
put_io_context(bio->bi_ioc);
|
||||
bio->bi_ioc = NULL;
|
||||
}
|
||||
if (bio->bi_css) {
|
||||
css_put(bio->bi_css);
|
||||
bio->bi_css = NULL;
|
||||
}
|
||||
if (bio->bi_blkg) {
|
||||
blkg_put(bio->bi_blkg);
|
||||
bio->bi_blkg = NULL;
|
||||
@@ -2042,16 +2120,16 @@ void bio_disassociate_task(struct bio *bio)
|
||||
}
|
||||
|
||||
/**
|
||||
* bio_clone_blkcg_association - clone blkcg association from src to dst bio
|
||||
* bio_clone_blkg_association - clone blkg association from src to dst bio
|
||||
* @dst: destination bio
|
||||
* @src: source bio
|
||||
*/
|
||||
void bio_clone_blkcg_association(struct bio *dst, struct bio *src)
|
||||
void bio_clone_blkg_association(struct bio *dst, struct bio *src)
|
||||
{
|
||||
if (src->bi_css)
|
||||
WARN_ON(bio_associate_blkcg(dst, src->bi_css));
|
||||
if (src->bi_blkg)
|
||||
bio_associate_blkg(dst, src->bi_blkg);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(bio_clone_blkcg_association);
|
||||
EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
|
||||
#endif /* CONFIG_BLK_CGROUP */
|
||||
|
||||
static void __init biovec_init_slabs(void)
|
||||
|
@@ -84,6 +84,37 @@ static void blkg_free(struct blkcg_gq *blkg)
|
||||
kfree(blkg);
|
||||
}
|
||||
|
||||
static void __blkg_release(struct rcu_head *rcu)
|
||||
{
|
||||
struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
|
||||
|
||||
percpu_ref_exit(&blkg->refcnt);
|
||||
|
||||
/* release the blkcg and parent blkg refs this blkg has been holding */
|
||||
css_put(&blkg->blkcg->css);
|
||||
if (blkg->parent)
|
||||
blkg_put(blkg->parent);
|
||||
|
||||
wb_congested_put(blkg->wb_congested);
|
||||
|
||||
blkg_free(blkg);
|
||||
}
|
||||
|
||||
/*
|
||||
* A group is RCU protected, but having an rcu lock does not mean that one
|
||||
* can access all the fields of blkg and assume these are valid. For
|
||||
* example, don't try to follow throtl_data and request queue links.
|
||||
*
|
||||
* Having a reference to blkg under an rcu allows accesses to only values
|
||||
* local to groups like group stats and group rate limits.
|
||||
*/
|
||||
static void blkg_release(struct percpu_ref *ref)
|
||||
{
|
||||
struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);
|
||||
|
||||
call_rcu(&blkg->rcu_head, __blkg_release);
|
||||
}
|
||||
|
||||
/**
|
||||
* blkg_alloc - allocate a blkg
|
||||
* @blkcg: block cgroup the new blkg is associated with
|
||||
@@ -110,7 +141,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
|
||||
blkg->q = q;
|
||||
INIT_LIST_HEAD(&blkg->q_node);
|
||||
blkg->blkcg = blkcg;
|
||||
atomic_set(&blkg->refcnt, 1);
|
||||
|
||||
/* root blkg uses @q->root_rl, init rl only for !root blkgs */
|
||||
if (blkcg != &blkcg_root) {
|
||||
@@ -217,6 +247,11 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
|
||||
blkg_get(blkg->parent);
|
||||
}
|
||||
|
||||
ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0,
|
||||
GFP_NOWAIT | __GFP_NOWARN);
|
||||
if (ret)
|
||||
goto err_cancel_ref;
|
||||
|
||||
/* invoke per-policy init */
|
||||
for (i = 0; i < BLKCG_MAX_POLS; i++) {
|
||||
struct blkcg_policy *pol = blkcg_policy[i];
|
||||
@@ -249,6 +284,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
|
||||
blkg_put(blkg);
|
||||
return ERR_PTR(ret);
|
||||
|
||||
err_cancel_ref:
|
||||
percpu_ref_exit(&blkg->refcnt);
|
||||
err_put_congested:
|
||||
wb_congested_put(wb_congested);
|
||||
err_put_css:
|
||||
@@ -259,7 +296,7 @@ err_free_blkg:
|
||||
}
|
||||
|
||||
/**
|
||||
* blkg_lookup_create - lookup blkg, try to create one if not there
|
||||
* __blkg_lookup_create - lookup blkg, try to create one if not there
|
||||
* @blkcg: blkcg of interest
|
||||
* @q: request_queue of interest
|
||||
*
|
||||
@@ -268,12 +305,11 @@ err_free_blkg:
|
||||
* that all non-root blkg's have access to the parent blkg. This function
|
||||
* should be called under RCU read lock and @q->queue_lock.
|
||||
*
|
||||
* Returns pointer to the looked up or created blkg on success, ERR_PTR()
|
||||
* value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not
|
||||
* dead and bypassing, returns ERR_PTR(-EBUSY).
|
||||
* Returns the blkg or the closest blkg if blkg_create fails as it walks
|
||||
* down from root.
|
||||
*/
|
||||
struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
|
||||
struct request_queue *q)
|
||||
struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
|
||||
struct request_queue *q)
|
||||
{
|
||||
struct blkcg_gq *blkg;
|
||||
|
||||
@@ -285,7 +321,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
|
||||
* we shouldn't allow anything to go through for a bypassing queue.
|
||||
*/
|
||||
if (unlikely(blk_queue_bypass(q)))
|
||||
return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
|
||||
return q->root_blkg;
|
||||
|
||||
blkg = __blkg_lookup(blkcg, q, true);
|
||||
if (blkg)
|
||||
@@ -293,23 +329,58 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
|
||||
|
||||
/*
|
||||
* Create blkgs walking down from blkcg_root to @blkcg, so that all
|
||||
* non-root blkgs have access to their parents.
|
||||
* non-root blkgs have access to their parents. Returns the closest
|
||||
* blkg to the intended blkg should blkg_create() fail.
|
||||
*/
|
||||
while (true) {
|
||||
struct blkcg *pos = blkcg;
|
||||
struct blkcg *parent = blkcg_parent(blkcg);
|
||||
struct blkcg_gq *ret_blkg = q->root_blkg;
|
||||
|
||||
while (parent && !__blkg_lookup(parent, q, false)) {
|
||||
while (parent) {
|
||||
blkg = __blkg_lookup(parent, q, false);
|
||||
if (blkg) {
|
||||
/* remember closest blkg */
|
||||
ret_blkg = blkg;
|
||||
break;
|
||||
}
|
||||
pos = parent;
|
||||
parent = blkcg_parent(parent);
|
||||
}
|
||||
|
||||
blkg = blkg_create(pos, q, NULL);
|
||||
if (pos == blkcg || IS_ERR(blkg))
|
||||
if (IS_ERR(blkg))
|
||||
return ret_blkg;
|
||||
if (pos == blkcg)
|
||||
return blkg;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* blkg_lookup_create - find or create a blkg
|
||||
* @blkcg: target block cgroup
|
||||
* @q: target request_queue
|
||||
*
|
||||
* This looks up or creates the blkg representing the unique pair
|
||||
* of the blkcg and the request_queue.
|
||||
*/
|
||||
struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
|
||||
struct request_queue *q)
|
||||
{
|
||||
struct blkcg_gq *blkg = blkg_lookup(blkcg, q);
|
||||
unsigned long flags;
|
||||
|
||||
if (unlikely(!blkg)) {
|
||||
spin_lock_irqsave(q->queue_lock, flags);
|
||||
|
||||
blkg = __blkg_lookup_create(blkcg, q);
|
||||
|
||||
spin_unlock_irqrestore(q->queue_lock, flags);
|
||||
}
|
||||
|
||||
return blkg;
|
||||
}
|
||||
|
||||
static void blkg_destroy(struct blkcg_gq *blkg)
|
||||
{
|
||||
struct blkcg *blkcg = blkg->blkcg;
|
||||
@@ -353,7 +424,7 @@ static void blkg_destroy(struct blkcg_gq *blkg)
|
||||
* Put the reference taken at the time of creation so that when all
|
||||
* queues are gone, group can be destroyed.
|
||||
*/
|
||||
blkg_put(blkg);
|
||||
percpu_ref_kill(&blkg->refcnt);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -380,29 +451,6 @@ static void blkg_destroy_all(struct request_queue *q)
|
||||
q->root_rl.blkg = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* A group is RCU protected, but having an rcu lock does not mean that one
|
||||
* can access all the fields of blkg and assume these are valid. For
|
||||
* example, don't try to follow throtl_data and request queue links.
|
||||
*
|
||||
* Having a reference to blkg under an rcu allows accesses to only values
|
||||
* local to groups like group stats and group rate limits.
|
||||
*/
|
||||
void __blkg_release_rcu(struct rcu_head *rcu_head)
|
||||
{
|
||||
struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
|
||||
|
||||
/* release the blkcg and parent blkg refs this blkg has been holding */
|
||||
css_put(&blkg->blkcg->css);
|
||||
if (blkg->parent)
|
||||
blkg_put(blkg->parent);
|
||||
|
||||
wb_congested_put(blkg->wb_congested);
|
||||
|
||||
blkg_free(blkg);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__blkg_release_rcu);
|
||||
|
||||
/*
|
||||
* The next function used by blk_queue_for_each_rl(). It's a bit tricky
|
||||
* because the root blkg uses @q->root_rl instead of its own rl.
|
||||
@@ -1748,8 +1796,7 @@ void blkcg_maybe_throttle_current(void)
|
||||
blkg = blkg_lookup(blkcg, q);
|
||||
if (!blkg)
|
||||
goto out;
|
||||
blkg = blkg_try_get(blkg);
|
||||
if (!blkg)
|
||||
if (!blkg_tryget(blkg))
|
||||
goto out;
|
||||
rcu_read_unlock();
|
||||
|
||||
|
block/blk-core.c
@@ -42,6 +42,7 @@
|
||||
#include "blk.h"
|
||||
#include "blk-mq.h"
|
||||
#include "blk-mq-sched.h"
|
||||
#include "blk-pm.h"
|
||||
#include "blk-rq-qos.h"
|
||||
|
||||
#ifdef CONFIG_DEBUG_FS
|
||||
@@ -421,24 +422,25 @@ void blk_sync_queue(struct request_queue *q)
|
||||
EXPORT_SYMBOL(blk_sync_queue);
|
||||
|
||||
/**
|
||||
* blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY
|
||||
* blk_set_pm_only - increment pm_only counter
|
||||
* @q: request queue pointer
|
||||
*
|
||||
* Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not
|
||||
* set and 1 if the flag was already set.
|
||||
*/
|
||||
int blk_set_preempt_only(struct request_queue *q)
|
||||
void blk_set_pm_only(struct request_queue *q)
|
||||
{
|
||||
return blk_queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q);
|
||||
atomic_inc(&q->pm_only);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_set_preempt_only);
|
||||
EXPORT_SYMBOL_GPL(blk_set_pm_only);
|
||||
|
||||
void blk_clear_preempt_only(struct request_queue *q)
|
||||
void blk_clear_pm_only(struct request_queue *q)
|
||||
{
|
||||
blk_queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
|
||||
wake_up_all(&q->mq_freeze_wq);
|
||||
int pm_only;
|
||||
|
||||
pm_only = atomic_dec_return(&q->pm_only);
|
||||
WARN_ON_ONCE(pm_only < 0);
|
||||
if (pm_only == 0)
|
||||
wake_up_all(&q->mq_freeze_wq);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
|
||||
EXPORT_SYMBOL_GPL(blk_clear_pm_only);
|
||||
|
||||
/**
|
||||
* __blk_run_queue_uncond - run a queue whether or not it has been stopped
|
||||
@@ -917,7 +919,7 @@ EXPORT_SYMBOL(blk_alloc_queue);
|
||||
*/
|
||||
int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
|
||||
{
|
||||
const bool preempt = flags & BLK_MQ_REQ_PREEMPT;
|
||||
const bool pm = flags & BLK_MQ_REQ_PREEMPT;
|
||||
|
||||
while (true) {
|
||||
bool success = false;
|
||||
@@ -925,11 +927,11 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
|
||||
rcu_read_lock();
|
||||
if (percpu_ref_tryget_live(&q->q_usage_counter)) {
|
||||
/*
|
||||
* The code that sets the PREEMPT_ONLY flag is
|
||||
* responsible for ensuring that that flag is globally
|
||||
* visible before the queue is unfrozen.
|
||||
* The code that increments the pm_only counter is
|
||||
* responsible for ensuring that that counter is
|
||||
* globally visible before the queue is unfrozen.
|
||||
*/
|
||||
if (preempt || !blk_queue_preempt_only(q)) {
|
||||
if (pm || !blk_queue_pm_only(q)) {
|
||||
success = true;
|
||||
} else {
|
||||
percpu_ref_put(&q->q_usage_counter);
|
||||
@@ -954,7 +956,8 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
|
||||
|
||||
wait_event(q->mq_freeze_wq,
|
||||
(atomic_read(&q->mq_freeze_depth) == 0 &&
|
||||
(preempt || !blk_queue_preempt_only(q))) ||
|
||||
(pm || (blk_pm_request_resume(q),
|
||||
!blk_queue_pm_only(q)))) ||
|
||||
blk_queue_dying(q));
|
||||
if (blk_queue_dying(q))
|
||||
return -ENODEV;
|
||||
@@ -1051,8 +1054,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
|
||||
mutex_init(&q->sysfs_lock);
|
||||
spin_lock_init(&q->__queue_lock);
|
||||
|
||||
if (!q->mq_ops)
|
||||
q->queue_lock = lock ? : &q->__queue_lock;
|
||||
q->queue_lock = lock ? : &q->__queue_lock;
|
||||
|
||||
/*
|
||||
* A queue starts its life with bypass turned on to avoid
|
||||
@@ -1160,7 +1162,7 @@ int blk_init_allocated_queue(struct request_queue *q)
|
||||
{
|
||||
WARN_ON_ONCE(q->mq_ops);
|
||||
|
||||
q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size);
|
||||
q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size, GFP_KERNEL);
|
||||
if (!q->fq)
|
||||
return -ENOMEM;
|
||||
|
||||
@@ -1726,16 +1728,6 @@ void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(part_round_stats);
|
||||
|
||||
#ifdef CONFIG_PM
|
||||
static void blk_pm_put_request(struct request *rq)
|
||||
{
|
||||
if (rq->q->dev && !(rq->rq_flags & RQF_PM) && !--rq->q->nr_pending)
|
||||
pm_runtime_mark_last_busy(rq->q->dev);
|
||||
}
|
||||
#else
|
||||
static inline void blk_pm_put_request(struct request *rq) {}
|
||||
#endif
|
||||
|
||||
void __blk_put_request(struct request_queue *q, struct request *req)
|
||||
{
|
||||
req_flags_t rq_flags = req->rq_flags;
|
||||
@@ -1752,6 +1744,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
|
||||
|
||||
blk_req_zone_write_unlock(req);
|
||||
blk_pm_put_request(req);
|
||||
blk_pm_mark_last_busy(req);
|
||||
|
||||
elv_completed_request(q, req);
|
||||
|
||||
@@ -2440,6 +2433,7 @@ blk_qc_t generic_make_request(struct bio *bio)
|
||||
if (q)
|
||||
blk_queue_exit(q);
|
||||
q = bio->bi_disk->queue;
|
||||
bio_reassociate_blkg(q, bio);
|
||||
flags = 0;
|
||||
if (bio->bi_opf & REQ_NOWAIT)
|
||||
flags = BLK_MQ_REQ_NOWAIT;
|
||||
@@ -2750,30 +2744,6 @@ void blk_account_io_done(struct request *req, u64 now)
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PM
|
||||
/*
|
||||
* Don't process normal requests when queue is suspended
|
||||
* or in the process of suspending/resuming
|
||||
*/
|
||||
static bool blk_pm_allow_request(struct request *rq)
|
||||
{
|
||||
switch (rq->q->rpm_status) {
|
||||
case RPM_RESUMING:
|
||||
case RPM_SUSPENDING:
|
||||
return rq->rq_flags & RQF_PM;
|
||||
case RPM_SUSPENDED:
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
#else
|
||||
static bool blk_pm_allow_request(struct request *rq)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
void blk_account_io_start(struct request *rq, bool new_io)
|
||||
{
|
||||
struct hd_struct *part;
|
||||
@@ -2819,11 +2789,14 @@ static struct request *elv_next_request(struct request_queue *q)
|
||||
|
||||
while (1) {
|
||||
list_for_each_entry(rq, &q->queue_head, queuelist) {
|
||||
if (blk_pm_allow_request(rq))
|
||||
return rq;
|
||||
|
||||
if (rq->rq_flags & RQF_SOFTBARRIER)
|
||||
break;
|
||||
#ifdef CONFIG_PM
|
||||
/*
|
||||
* If a request gets queued in state RPM_SUSPENDED
|
||||
* then that's a kernel bug.
|
||||
*/
|
||||
WARN_ON_ONCE(q->rpm_status == RPM_SUSPENDED);
|
||||
#endif
|
||||
return rq;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -3755,191 +3728,6 @@ void blk_finish_plug(struct blk_plug *plug)
|
||||
}
|
||||
EXPORT_SYMBOL(blk_finish_plug);
|
||||
|
||||
#ifdef CONFIG_PM
|
||||
/**
|
||||
* blk_pm_runtime_init - Block layer runtime PM initialization routine
|
||||
* @q: the queue of the device
|
||||
* @dev: the device the queue belongs to
|
||||
*
|
||||
* Description:
|
||||
* Initialize runtime-PM-related fields for @q and start auto suspend for
|
||||
* @dev. Drivers that want to take advantage of request-based runtime PM
|
||||
* should call this function after @dev has been initialized, and its
|
||||
* request queue @q has been allocated, and runtime PM for it can not happen
|
||||
* yet(either due to disabled/forbidden or its usage_count > 0). In most
|
||||
* cases, driver should call this function before any I/O has taken place.
|
||||
*
|
||||
* This function takes care of setting up using auto suspend for the device,
|
||||
* the autosuspend delay is set to -1 to make runtime suspend impossible
|
||||
* until an updated value is either set by user or by driver. Drivers do
|
||||
* not need to touch other autosuspend settings.
|
||||
*
|
||||
* The block layer runtime PM is request based, so only works for drivers
|
||||
* that use request as their IO unit instead of those directly use bio's.
|
||||
*/
|
||||
void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
|
||||
{
|
||||
/* Don't enable runtime PM for blk-mq until it is ready */
|
||||
if (q->mq_ops) {
|
||||
pm_runtime_disable(dev);
|
||||
return;
|
||||
}
|
||||
|
||||
q->dev = dev;
|
||||
q->rpm_status = RPM_ACTIVE;
|
||||
pm_runtime_set_autosuspend_delay(q->dev, -1);
|
||||
pm_runtime_use_autosuspend(q->dev);
|
||||
}
|
||||
EXPORT_SYMBOL(blk_pm_runtime_init);
|
||||
|
||||
/**
|
||||
* blk_pre_runtime_suspend - Pre runtime suspend check
|
||||
* @q: the queue of the device
|
||||
*
|
||||
* Description:
|
||||
* This function will check if runtime suspend is allowed for the device
|
||||
* by examining if there are any requests pending in the queue. If there
|
||||
* are requests pending, the device can not be runtime suspended; otherwise,
|
||||
* the queue's status will be updated to SUSPENDING and the driver can
|
||||
* proceed to suspend the device.
|
||||
*
|
||||
* For the not allowed case, we mark last busy for the device so that
|
||||
* runtime PM core will try to autosuspend it some time later.
|
||||
*
|
||||
* This function should be called near the start of the device's
|
||||
* runtime_suspend callback.
|
||||
*
|
||||
* Return:
|
||||
* 0 - OK to runtime suspend the device
|
||||
* -EBUSY - Device should not be runtime suspended
|
||||
*/
|
||||
int blk_pre_runtime_suspend(struct request_queue *q)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
if (!q->dev)
|
||||
return ret;
|
||||
|
||||
spin_lock_irq(q->queue_lock);
|
||||
if (q->nr_pending) {
|
||||
ret = -EBUSY;
|
||||
pm_runtime_mark_last_busy(q->dev);
|
||||
} else {
|
||||
q->rpm_status = RPM_SUSPENDING;
|
||||
}
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(blk_pre_runtime_suspend);
|
||||
|
||||
/**
|
||||
* blk_post_runtime_suspend - Post runtime suspend processing
|
||||
* @q: the queue of the device
|
||||
* @err: return value of the device's runtime_suspend function
|
||||
*
|
||||
* Description:
|
||||
* Update the queue's runtime status according to the return value of the
|
||||
* device's runtime suspend function and mark last busy for the device so
|
||||
* that PM core will try to auto suspend the device at a later time.
|
||||
*
|
||||
* This function should be called near the end of the device's
|
||||
* runtime_suspend callback.
|
||||
*/
|
||||
void blk_post_runtime_suspend(struct request_queue *q, int err)
|
||||
{
|
||||
if (!q->dev)
|
||||
return;
|
||||
|
||||
spin_lock_irq(q->queue_lock);
|
||||
if (!err) {
|
||||
q->rpm_status = RPM_SUSPENDED;
|
||||
} else {
|
||||
q->rpm_status = RPM_ACTIVE;
|
||||
pm_runtime_mark_last_busy(q->dev);
|
||||
}
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
}
|
||||
EXPORT_SYMBOL(blk_post_runtime_suspend);
|
||||
|
||||
/**
|
||||
* blk_pre_runtime_resume - Pre runtime resume processing
|
||||
* @q: the queue of the device
|
||||
*
|
||||
* Description:
|
||||
* Update the queue's runtime status to RESUMING in preparation for the
|
||||
* runtime resume of the device.
|
||||
*
|
||||
* This function should be called near the start of the device's
|
||||
* runtime_resume callback.
|
||||
*/
|
||||
void blk_pre_runtime_resume(struct request_queue *q)
|
||||
{
|
||||
if (!q->dev)
|
||||
return;
|
||||
|
||||
spin_lock_irq(q->queue_lock);
|
||||
q->rpm_status = RPM_RESUMING;
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
}
|
||||
EXPORT_SYMBOL(blk_pre_runtime_resume);
|
||||
|
||||
/**
|
||||
* blk_post_runtime_resume - Post runtime resume processing
|
||||
* @q: the queue of the device
|
||||
* @err: return value of the device's runtime_resume function
|
||||
*
|
||||
* Description:
|
||||
* Update the queue's runtime status according to the return value of the
|
||||
* device's runtime_resume function. If it is successfully resumed, process
|
||||
* the requests that are queued into the device's queue when it is resuming
|
||||
* and then mark last busy and initiate autosuspend for it.
|
||||
*
|
||||
* This function should be called near the end of the device's
|
||||
* runtime_resume callback.
|
||||
*/
|
||||
void blk_post_runtime_resume(struct request_queue *q, int err)
|
||||
{
|
||||
if (!q->dev)
|
||||
return;
|
||||
|
||||
spin_lock_irq(q->queue_lock);
|
||||
if (!err) {
|
||||
q->rpm_status = RPM_ACTIVE;
|
||||
__blk_run_queue(q);
|
||||
pm_runtime_mark_last_busy(q->dev);
|
||||
pm_request_autosuspend(q->dev);
|
||||
} else {
|
||||
q->rpm_status = RPM_SUSPENDED;
|
||||
}
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
}
|
||||
EXPORT_SYMBOL(blk_post_runtime_resume);
|
||||
|
||||
/**
|
||||
* blk_set_runtime_active - Force runtime status of the queue to be active
|
||||
* @q: the queue of the device
|
||||
*
|
||||
* If the device is left runtime suspended during system suspend the resume
|
||||
* hook typically resumes the device and corrects runtime status
|
||||
* accordingly. However, that does not affect the queue runtime PM status
|
||||
* which is still "suspended". This prevents processing requests from the
|
||||
* queue.
|
||||
*
|
||||
* This function can be used in driver's resume hook to correct queue
|
||||
* runtime PM status and re-enable peeking requests from the queue. It
|
||||
* should be called before first request is added to the queue.
|
||||
*/
|
||||
void blk_set_runtime_active(struct request_queue *q)
|
||||
{
|
||||
spin_lock_irq(q->queue_lock);
|
||||
q->rpm_status = RPM_ACTIVE;
|
||||
pm_runtime_mark_last_busy(q->dev);
|
||||
pm_request_autosuspend(q->dev);
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
}
|
||||
EXPORT_SYMBOL(blk_set_runtime_active);
|
||||
#endif
|
||||
|
||||
int __init blk_dev_init(void)
|
||||
{
|
||||
BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
|
||||
block/blk-flush.c
@@ -566,12 +566,12 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
|
||||
EXPORT_SYMBOL(blkdev_issue_flush);
|
||||
|
||||
struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
|
||||
int node, int cmd_size)
|
||||
int node, int cmd_size, gfp_t flags)
|
||||
{
|
||||
struct blk_flush_queue *fq;
|
||||
int rq_sz = sizeof(struct request);
|
||||
|
||||
fq = kzalloc_node(sizeof(*fq), GFP_KERNEL, node);
|
||||
fq = kzalloc_node(sizeof(*fq), flags, node);
|
||||
if (!fq)
|
||||
goto fail;
|
||||
|
||||
@@ -579,7 +579,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
|
||||
spin_lock_init(&fq->mq_flush_lock);
|
||||
|
||||
rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
|
||||
fq->flush_rq = kzalloc_node(rq_sz, GFP_KERNEL, node);
|
||||
fq->flush_rq = kzalloc_node(rq_sz, flags, node);
|
||||
if (!fq->flush_rq)
|
||||
goto fail_rq;
|
||||
|
||||
block/blk-integrity.c
@@ -49,12 +49,8 @@ int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio)
|
||||
bio_for_each_integrity_vec(iv, bio, iter) {
|
||||
|
||||
if (prev) {
|
||||
if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv))
|
||||
if (!biovec_phys_mergeable(q, &ivprv, &iv))
|
||||
goto new_segment;
|
||||
|
||||
if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv))
|
||||
goto new_segment;
|
||||
|
||||
if (seg_size + iv.bv_len > queue_max_segment_size(q))
|
||||
goto new_segment;
|
||||
|
||||
@@ -95,12 +91,8 @@ int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio,
|
||||
bio_for_each_integrity_vec(iv, bio, iter) {
|
||||
|
||||
if (prev) {
|
||||
if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv))
|
||||
if (!biovec_phys_mergeable(q, &ivprv, &iv))
|
||||
goto new_segment;
|
||||
|
||||
if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv))
|
||||
goto new_segment;
|
||||
|
||||
if (sg->length + iv.bv_len > queue_max_segment_size(q))
|
||||
goto new_segment;
|
||||
|
||||
block/blk-iolatency.c
@@ -115,9 +115,22 @@ struct child_latency_info {
|
||||
atomic_t scale_cookie;
|
||||
};
|
||||
|
||||
struct percentile_stats {
|
||||
u64 total;
|
||||
u64 missed;
|
||||
};
|
||||
|
||||
struct latency_stat {
|
||||
union {
|
||||
struct percentile_stats ps;
|
||||
struct blk_rq_stat rqs;
|
||||
};
|
||||
};
|
||||
|
||||
struct iolatency_grp {
|
||||
struct blkg_policy_data pd;
|
||||
struct blk_rq_stat __percpu *stats;
|
||||
struct latency_stat __percpu *stats;
|
||||
struct latency_stat cur_stat;
|
||||
struct blk_iolatency *blkiolat;
|
||||
struct rq_depth rq_depth;
|
||||
struct rq_wait rq_wait;
|
||||
@@ -132,6 +145,7 @@ struct iolatency_grp {
|
||||
/* Our current number of IO's for the last summation. */
|
||||
u64 nr_samples;
|
||||
|
||||
bool ssd;
|
||||
struct child_latency_info child_lat;
|
||||
};
|
||||
|
||||
@@ -172,6 +186,80 @@ static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
|
||||
return pd_to_blkg(&iolat->pd);
|
||||
}
|
||||
|
||||
static inline void latency_stat_init(struct iolatency_grp *iolat,
|
||||
struct latency_stat *stat)
|
||||
{
|
||||
if (iolat->ssd) {
|
||||
stat->ps.total = 0;
|
||||
stat->ps.missed = 0;
|
||||
} else
|
||||
blk_rq_stat_init(&stat->rqs);
|
||||
}
|
||||
|
||||
static inline void latency_stat_sum(struct iolatency_grp *iolat,
|
||||
struct latency_stat *sum,
|
||||
struct latency_stat *stat)
|
||||
{
|
||||
if (iolat->ssd) {
|
||||
sum->ps.total += stat->ps.total;
|
||||
sum->ps.missed += stat->ps.missed;
|
||||
} else
|
||||
blk_rq_stat_sum(&sum->rqs, &stat->rqs);
|
||||
}
|
||||
|
||||
static inline void latency_stat_record_time(struct iolatency_grp *iolat,
|
||||
u64 req_time)
|
||||
{
|
||||
struct latency_stat *stat = get_cpu_ptr(iolat->stats);
|
||||
if (iolat->ssd) {
|
||||
if (req_time >= iolat->min_lat_nsec)
|
||||
stat->ps.missed++;
|
||||
stat->ps.total++;
|
||||
} else
|
||||
blk_rq_stat_add(&stat->rqs, req_time);
|
||||
put_cpu_ptr(stat);
|
||||
}
|
||||
|
||||
static inline bool latency_sum_ok(struct iolatency_grp *iolat,
|
||||
struct latency_stat *stat)
|
||||
{
|
||||
if (iolat->ssd) {
|
||||
u64 thresh = div64_u64(stat->ps.total, 10);
|
||||
thresh = max(thresh, 1ULL);
|
||||
return stat->ps.missed < thresh;
|
||||
}
|
||||
return stat->rqs.mean <= iolat->min_lat_nsec;
|
||||
}
|
||||
|
||||
static inline u64 latency_stat_samples(struct iolatency_grp *iolat,
|
||||
struct latency_stat *stat)
|
||||
{
|
||||
if (iolat->ssd)
|
||||
return stat->ps.total;
|
||||
return stat->rqs.nr_samples;
|
||||
}
|
||||
|
||||
static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
|
||||
struct latency_stat *stat)
|
||||
{
|
||||
int exp_idx;
|
||||
|
||||
if (iolat->ssd)
|
||||
return;
|
||||
|
||||
/*
|
||||
* CALC_LOAD takes in a number stored in fixed point representation.
|
||||
* Because we are using this for IO time in ns, the values stored
|
||||
* are significantly larger than the FIXED_1 denominator (2048).
|
||||
* Therefore, rounding errors in the calculation are negligible and
|
||||
* can be ignored.
|
||||
*/
|
||||
exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
|
||||
div64_u64(iolat->cur_win_nsec,
|
||||
BLKIOLATENCY_EXP_BUCKET_SIZE));
|
||||
CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat->rqs.mean);
|
||||
}
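
To make the rounding argument in the comment above concrete, here is a small standalone sketch that mimics the kernel's CALC_LOAD exponentially weighted average with the usual FIXED_1 = 2048 scale; the macro body is reproduced from memory of include/linux/sched/loadavg.h and the decay factor and sample values are illustrative, not taken from blk-iolatency.

/* Standalone mimic of CALC_LOAD, for illustration only. */
#include <stdio.h>
#include <stdint.h>

#define FSHIFT	11			/* bits of fixed-point precision */
#define FIXED_1	(1UL << FSHIFT)		/* 1.0 in fixed point (2048) */

/* new = old * (exp/FIXED_1) + sample * (1 - exp/FIXED_1) */
static uint64_t calc_load(uint64_t load, uint64_t exp, uint64_t n)
{
	load *= exp;
	load += n * (FIXED_1 - exp);
	return load >> FSHIFT;	/* truncates < 1 ns per step */
}

int main(void)
{
	uint64_t exp_factor = 1242;	/* ~exp(-1/2) in fixed point, illustrative */
	uint64_t avg = 0;

	/* Feed a few 2ms (2,000,000 ns) samples; the per-step truncation
	 * of less than 1 ns is negligible at these magnitudes, which is
	 * the point the comment above is making. */
	for (int i = 0; i < 5; i++) {
		avg = calc_load(avg, exp_factor, 2000000);
		printf("window %d: avg ~ %llu ns\n", i,
		       (unsigned long long)avg);
	}
	return 0;
}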
|
||||
|
||||
static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
|
||||
wait_queue_entry_t *wait,
|
||||
bool first_block)
|
||||
@@ -255,7 +343,7 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat,
|
||||
struct child_latency_info *lat_info,
|
||||
bool up)
|
||||
{
|
||||
unsigned long qd = blk_queue_depth(blkiolat->rqos.q);
|
||||
unsigned long qd = blkiolat->rqos.q->nr_requests;
|
||||
unsigned long scale = scale_amount(qd, up);
|
||||
unsigned long old = atomic_read(&lat_info->scale_cookie);
|
||||
unsigned long max_scale = qd << 1;
|
||||
@@ -295,10 +383,9 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat,
|
||||
*/
|
||||
static void scale_change(struct iolatency_grp *iolat, bool up)
|
||||
{
|
||||
unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q);
|
||||
unsigned long qd = iolat->blkiolat->rqos.q->nr_requests;
|
||||
unsigned long scale = scale_amount(qd, up);
|
||||
unsigned long old = iolat->rq_depth.max_depth;
|
||||
bool changed = false;
|
||||
|
||||
if (old > qd)
|
||||
old = qd;
|
||||
@@ -308,15 +395,13 @@ static void scale_change(struct iolatency_grp *iolat, bool up)
|
||||
return;
|
||||
|
||||
if (old < qd) {
|
||||
changed = true;
|
||||
old += scale;
|
||||
old = min(old, qd);
|
||||
iolat->rq_depth.max_depth = old;
|
||||
wake_up_all(&iolat->rq_wait.wait);
|
||||
}
|
||||
} else if (old > 1) {
|
||||
} else {
|
||||
old >>= 1;
|
||||
changed = true;
|
||||
iolat->rq_depth.max_depth = max(old, 1UL);
|
||||
}
|
||||
}
|
||||
@@ -369,7 +454,7 @@ static void check_scale_change(struct iolatency_grp *iolat)
|
||||
* scale down event.
|
||||
*/
|
||||
samples_thresh = lat_info->nr_samples * 5;
|
||||
samples_thresh = div64_u64(samples_thresh, 100);
|
||||
samples_thresh = max(1ULL, div64_u64(samples_thresh, 100));
|
||||
if (iolat->nr_samples <= samples_thresh)
|
||||
return;
|
||||
}
|
||||
@@ -395,34 +480,12 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio,
|
||||
spinlock_t *lock)
|
||||
{
|
||||
struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
|
||||
struct blkcg *blkcg;
|
||||
struct blkcg_gq *blkg;
|
||||
struct request_queue *q = rqos->q;
|
||||
struct blkcg_gq *blkg = bio->bi_blkg;
|
||||
bool issue_as_root = bio_issue_as_root_blkg(bio);
|
||||
|
||||
if (!blk_iolatency_enabled(blkiolat))
|
||||
return;
|
||||
|
||||
rcu_read_lock();
|
||||
blkcg = bio_blkcg(bio);
|
||||
bio_associate_blkcg(bio, &blkcg->css);
|
||||
blkg = blkg_lookup(blkcg, q);
|
||||
if (unlikely(!blkg)) {
|
||||
if (!lock)
|
||||
spin_lock_irq(q->queue_lock);
|
||||
blkg = blkg_lookup_create(blkcg, q);
|
||||
if (IS_ERR(blkg))
|
||||
blkg = NULL;
|
||||
if (!lock)
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
}
|
||||
if (!blkg)
|
||||
goto out;
|
||||
|
||||
bio_issue_init(&bio->bi_issue, bio_sectors(bio));
|
||||
bio_associate_blkg(bio, blkg);
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
while (blkg && blkg->parent) {
|
||||
struct iolatency_grp *iolat = blkg_to_lat(blkg);
|
||||
if (!iolat) {
|
||||
@@ -443,7 +506,6 @@ static void iolatency_record_time(struct iolatency_grp *iolat,
|
||||
struct bio_issue *issue, u64 now,
|
||||
bool issue_as_root)
|
||||
{
|
||||
struct blk_rq_stat *rq_stat;
|
||||
u64 start = bio_issue_time(issue);
|
||||
u64 req_time;
|
||||
|
||||
@@ -469,9 +531,7 @@ static void iolatency_record_time(struct iolatency_grp *iolat,
|
||||
return;
|
||||
}
|
||||
|
||||
rq_stat = get_cpu_ptr(iolat->stats);
|
||||
blk_rq_stat_add(rq_stat, req_time);
|
||||
put_cpu_ptr(rq_stat);
|
||||
latency_stat_record_time(iolat, req_time);
|
||||
}
|
||||
|
||||
#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
|
||||
@@ -482,17 +542,17 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
|
||||
struct blkcg_gq *blkg = lat_to_blkg(iolat);
|
||||
struct iolatency_grp *parent;
|
||||
struct child_latency_info *lat_info;
|
||||
struct blk_rq_stat stat;
|
||||
struct latency_stat stat;
|
||||
unsigned long flags;
|
||||
int cpu, exp_idx;
|
||||
int cpu;
|
||||
|
||||
blk_rq_stat_init(&stat);
|
||||
latency_stat_init(iolat, &stat);
|
||||
preempt_disable();
|
||||
for_each_online_cpu(cpu) {
|
||||
struct blk_rq_stat *s;
|
||||
struct latency_stat *s;
|
||||
s = per_cpu_ptr(iolat->stats, cpu);
|
||||
blk_rq_stat_sum(&stat, s);
|
||||
blk_rq_stat_init(s);
|
||||
latency_stat_sum(iolat, &stat, s);
|
||||
latency_stat_init(iolat, s);
|
||||
}
|
||||
preempt_enable();
|
||||
|
||||
@@ -502,41 +562,36 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
|
||||
|
||||
lat_info = &parent->child_lat;
|
||||
|
||||
/*
|
||||
* CALC_LOAD takes in a number stored in fixed point representation.
|
||||
* Because we are using this for IO time in ns, the values stored
|
||||
* are significantly larger than the FIXED_1 denominator (2048).
|
||||
* Therefore, rounding errors in the calculation are negligible and
|
||||
* can be ignored.
|
||||
*/
|
||||
exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
|
||||
div64_u64(iolat->cur_win_nsec,
|
||||
BLKIOLATENCY_EXP_BUCKET_SIZE));
|
||||
CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean);
|
||||
iolat_update_total_lat_avg(iolat, &stat);
|
||||
|
||||
/* Everything is ok and we don't need to adjust the scale. */
|
||||
if (stat.mean <= iolat->min_lat_nsec &&
|
||||
if (latency_sum_ok(iolat, &stat) &&
|
||||
atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
|
||||
return;
|
||||
|
||||
/* Somebody beat us to the punch, just bail. */
|
||||
spin_lock_irqsave(&lat_info->lock, flags);
|
||||
|
||||
latency_stat_sum(iolat, &iolat->cur_stat, &stat);
|
||||
lat_info->nr_samples -= iolat->nr_samples;
|
||||
lat_info->nr_samples += stat.nr_samples;
|
||||
iolat->nr_samples = stat.nr_samples;
|
||||
lat_info->nr_samples += latency_stat_samples(iolat, &iolat->cur_stat);
|
||||
iolat->nr_samples = latency_stat_samples(iolat, &iolat->cur_stat);
|
||||
|
||||
if ((lat_info->last_scale_event >= now ||
|
||||
now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) &&
|
||||
lat_info->scale_lat <= iolat->min_lat_nsec)
|
||||
now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME))
|
||||
goto out;
|
||||
|
||||
if (stat.mean <= iolat->min_lat_nsec &&
|
||||
stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) {
|
||||
if (latency_sum_ok(iolat, &iolat->cur_stat) &&
|
||||
latency_sum_ok(iolat, &stat)) {
|
||||
if (latency_stat_samples(iolat, &iolat->cur_stat) <
|
||||
BLKIOLATENCY_MIN_GOOD_SAMPLES)
|
||||
goto out;
|
||||
if (lat_info->scale_grp == iolat) {
|
||||
lat_info->last_scale_event = now;
|
||||
scale_cookie_change(iolat->blkiolat, lat_info, true);
|
||||
}
|
||||
} else if (stat.mean > iolat->min_lat_nsec) {
|
||||
} else if (lat_info->scale_lat == 0 ||
|
||||
lat_info->scale_lat >= iolat->min_lat_nsec) {
|
||||
lat_info->last_scale_event = now;
|
||||
if (!lat_info->scale_grp ||
|
||||
lat_info->scale_lat > iolat->min_lat_nsec) {
|
||||
@@ -545,6 +600,7 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
|
||||
}
|
||||
scale_cookie_change(iolat->blkiolat, lat_info, false);
|
||||
}
|
||||
latency_stat_init(iolat, &iolat->cur_stat);
|
||||
out:
|
||||
spin_unlock_irqrestore(&lat_info->lock, flags);
|
||||
}
|
||||
@@ -650,7 +706,7 @@ static void blkiolatency_timer_fn(struct timer_list *t)
|
||||
* We could be exiting, don't access the pd unless we have a
|
||||
* ref on the blkg.
|
||||
*/
|
||||
if (!blkg_try_get(blkg))
|
||||
if (!blkg_tryget(blkg))
|
||||
continue;
|
||||
|
||||
iolat = blkg_to_lat(blkg);
|
||||
@@ -761,7 +817,6 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
|
||||
{
|
||||
struct blkcg *blkcg = css_to_blkcg(of_css(of));
|
||||
struct blkcg_gq *blkg;
|
||||
struct blk_iolatency *blkiolat;
|
||||
struct blkg_conf_ctx ctx;
|
||||
struct iolatency_grp *iolat;
|
||||
char *p, *tok;
|
||||
@@ -774,7 +829,6 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
|
||||
return ret;
|
||||
|
||||
iolat = blkg_to_lat(ctx.blkg);
|
||||
blkiolat = iolat->blkiolat;
|
||||
p = ctx.body;
|
||||
|
||||
ret = -EINVAL;
|
||||
@@ -835,13 +889,43 @@ static int iolatency_print_limit(struct seq_file *sf, void *v)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf,
|
||||
size_t size)
|
||||
{
|
||||
struct latency_stat stat;
|
||||
int cpu;
|
||||
|
||||
latency_stat_init(iolat, &stat);
|
||||
preempt_disable();
|
||||
for_each_online_cpu(cpu) {
|
||||
struct latency_stat *s;
|
||||
s = per_cpu_ptr(iolat->stats, cpu);
|
||||
latency_stat_sum(iolat, &stat, s);
|
||||
}
|
||||
preempt_enable();
|
||||
|
||||
if (iolat->rq_depth.max_depth == UINT_MAX)
|
||||
return scnprintf(buf, size, " missed=%llu total=%llu depth=max",
|
||||
(unsigned long long)stat.ps.missed,
|
||||
(unsigned long long)stat.ps.total);
|
||||
return scnprintf(buf, size, " missed=%llu total=%llu depth=%u",
|
||||
(unsigned long long)stat.ps.missed,
|
||||
(unsigned long long)stat.ps.total,
|
||||
iolat->rq_depth.max_depth);
|
||||
}
|
||||
|
||||
static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
|
||||
size_t size)
|
||||
{
|
||||
struct iolatency_grp *iolat = pd_to_lat(pd);
|
||||
unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
|
||||
unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
|
||||
unsigned long long avg_lat;
|
||||
unsigned long long cur_win;
|
||||
|
||||
if (iolat->ssd)
|
||||
return iolatency_ssd_stat(iolat, buf, size);
|
||||
|
||||
avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
|
||||
cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
|
||||
if (iolat->rq_depth.max_depth == UINT_MAX)
|
||||
return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
|
||||
avg_lat, cur_win);
|
||||
@@ -858,8 +942,8 @@ static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
|
||||
iolat = kzalloc_node(sizeof(*iolat), gfp, node);
|
||||
if (!iolat)
|
||||
return NULL;
|
||||
iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat),
|
||||
__alignof__(struct blk_rq_stat), gfp);
|
||||
iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat),
|
||||
__alignof__(struct latency_stat), gfp);
|
||||
if (!iolat->stats) {
|
||||
kfree(iolat);
|
||||
return NULL;
|
||||
@@ -876,15 +960,21 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
|
||||
u64 now = ktime_to_ns(ktime_get());
|
||||
int cpu;
|
||||
|
||||
if (blk_queue_nonrot(blkg->q))
|
||||
iolat->ssd = true;
|
||||
else
|
||||
iolat->ssd = false;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct blk_rq_stat *stat;
|
||||
struct latency_stat *stat;
|
||||
stat = per_cpu_ptr(iolat->stats, cpu);
|
||||
blk_rq_stat_init(stat);
|
||||
latency_stat_init(iolat, stat);
|
||||
}
|
||||
|
||||
latency_stat_init(iolat, &iolat->cur_stat);
|
||||
rq_wait_init(&iolat->rq_wait);
|
||||
spin_lock_init(&iolat->child_lat.lock);
|
||||
iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q);
|
||||
iolat->rq_depth.queue_depth = blkg->q->nr_requests;
|
||||
iolat->rq_depth.max_depth = UINT_MAX;
|
||||
iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
|
||||
iolat->blkiolat = blkiolat;
|
||||
block/blk-merge.c
@@ -12,6 +12,69 @@
|
||||
|
||||
#include "blk.h"
|
||||
|
||||
/*
|
||||
* Check if the two bvecs from two bios can be merged to one segment. If yes,
|
||||
* no need to check gap between the two bios since the 1st bio and the 1st bvec
|
||||
* in the 2nd bio can be handled in one segment.
|
||||
*/
|
||||
static inline bool bios_segs_mergeable(struct request_queue *q,
|
||||
struct bio *prev, struct bio_vec *prev_last_bv,
|
||||
struct bio_vec *next_first_bv)
|
||||
{
|
||||
if (!biovec_phys_mergeable(q, prev_last_bv, next_first_bv))
|
||||
return false;
|
||||
if (prev->bi_seg_back_size + next_first_bv->bv_len >
|
||||
queue_max_segment_size(q))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool bio_will_gap(struct request_queue *q,
|
||||
struct request *prev_rq, struct bio *prev, struct bio *next)
|
||||
{
|
||||
struct bio_vec pb, nb;
|
||||
|
||||
if (!bio_has_data(prev) || !queue_virt_boundary(q))
|
||||
return false;
|
||||
|
||||
/*
|
||||
* Don't merge if the 1st bio starts with non-zero offset, otherwise it
|
||||
* is quite difficult to respect the sg gap limit. We work hard to
|
||||
* merge a huge number of small single bios in case of mkfs.
|
||||
*/
|
||||
if (prev_rq)
|
||||
bio_get_first_bvec(prev_rq->bio, &pb);
|
||||
else
|
||||
bio_get_first_bvec(prev, &pb);
|
||||
if (pb.bv_offset)
|
||||
return true;
|
||||
|
||||
/*
|
||||
* We don't need to worry about the situation that the merged segment
|
||||
* ends in unaligned virt boundary:
|
||||
*
|
||||
* - if 'pb' ends aligned, the merged segment ends aligned
|
||||
* - if 'pb' ends unaligned, the next bio must include
|
||||
* one single bvec of 'nb', otherwise the 'nb' can't
|
||||
* merge with 'pb'
|
||||
*/
|
||||
bio_get_last_bvec(prev, &pb);
|
||||
bio_get_first_bvec(next, &nb);
|
||||
if (bios_segs_mergeable(q, prev, &pb, &nb))
|
||||
return false;
|
||||
return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
|
||||
}
|
||||
|
||||
static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
|
||||
{
|
||||
return bio_will_gap(req->q, req, req->biotail, bio);
|
||||
}
|
||||
|
||||
static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
|
||||
{
|
||||
return bio_will_gap(req->q, NULL, bio, req->bio);
|
||||
}
|
||||
|
||||
static struct bio *blk_bio_discard_split(struct request_queue *q,
|
||||
struct bio *bio,
|
||||
struct bio_set *bs,
|
||||
@@ -134,9 +197,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
|
||||
if (bvprvp && blk_queue_cluster(q)) {
|
||||
if (seg_size + bv.bv_len > queue_max_segment_size(q))
|
||||
goto new_segment;
|
||||
if (!BIOVEC_PHYS_MERGEABLE(bvprvp, &bv))
|
||||
goto new_segment;
|
||||
if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv))
|
||||
if (!biovec_phys_mergeable(q, bvprvp, &bv))
|
||||
goto new_segment;
|
||||
|
||||
seg_size += bv.bv_len;
|
||||
@@ -267,9 +328,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
|
||||
if (seg_size + bv.bv_len
|
||||
> queue_max_segment_size(q))
|
||||
goto new_segment;
|
||||
if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv))
|
||||
goto new_segment;
|
||||
if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv))
|
||||
if (!biovec_phys_mergeable(q, &bvprv, &bv))
|
||||
goto new_segment;
|
||||
|
||||
seg_size += bv.bv_len;
|
||||
@@ -349,17 +408,7 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
|
||||
bio_get_last_bvec(bio, &end_bv);
|
||||
bio_get_first_bvec(nxt, &nxt_bv);
|
||||
|
||||
if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* bio and nxt are contiguous in memory; check if the queue allows
|
||||
* these two to be merged into one
|
||||
*/
|
||||
if (BIOVEC_SEG_BOUNDARY(q, &end_bv, &nxt_bv))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
return biovec_phys_mergeable(q, &end_bv, &nxt_bv);
|
||||
}
|
||||
|
||||
static inline void
|
||||
@@ -373,10 +422,7 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
|
||||
if (*sg && *cluster) {
|
||||
if ((*sg)->length + nbytes > queue_max_segment_size(q))
|
||||
goto new_segment;
|
||||
|
||||
if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
|
||||
goto new_segment;
|
||||
if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
|
||||
if (!biovec_phys_mergeable(q, bvprv, bvec))
|
||||
goto new_segment;
|
||||
|
||||
(*sg)->length += nbytes;
|
||||
block/blk-mq-debugfs.c
@@ -102,6 +102,14 @@ static int blk_flags_show(struct seq_file *m, const unsigned long flags,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int queue_pm_only_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct request_queue *q = data;
|
||||
|
||||
seq_printf(m, "%d\n", atomic_read(&q->pm_only));
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name
|
||||
static const char *const blk_queue_flag_name[] = {
|
||||
QUEUE_FLAG_NAME(QUEUED),
|
||||
@@ -132,7 +140,6 @@ static const char *const blk_queue_flag_name[] = {
|
||||
QUEUE_FLAG_NAME(REGISTERED),
|
||||
QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
|
||||
QUEUE_FLAG_NAME(QUIESCED),
|
||||
QUEUE_FLAG_NAME(PREEMPT_ONLY),
|
||||
};
|
||||
#undef QUEUE_FLAG_NAME
|
||||
|
||||
@@ -209,6 +216,7 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf,
|
||||
static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
|
||||
{ "poll_stat", 0400, queue_poll_stat_show },
|
||||
{ "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
|
||||
{ "pm_only", 0600, queue_pm_only_show, NULL },
|
||||
{ "state", 0600, queue_state_show, queue_state_write },
|
||||
{ "write_hints", 0600, queue_write_hint_show, queue_write_hint_store },
|
||||
{ "zone_wlock", 0400, queue_zone_wlock_show, NULL },
|
||||
@@ -423,8 +431,7 @@ static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
|
||||
{
|
||||
const struct show_busy_params *params = data;
|
||||
|
||||
if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx &&
|
||||
blk_mq_rq_state(rq) != MQ_RQ_IDLE)
|
||||
if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx)
|
||||
__blk_mq_debugfs_rq_show(params->m,
|
||||
list_entry_rq(&rq->queuelist));
|
||||
}
|
||||
block/blk-mq-sched.h
@@ -49,12 +49,12 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline void blk_mq_sched_completed_request(struct request *rq)
|
||||
static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
|
||||
{
|
||||
struct elevator_queue *e = rq->q->elevator;
|
||||
|
||||
if (e && e->type->ops.mq.completed_request)
|
||||
e->type->ops.mq.completed_request(rq);
|
||||
e->type->ops.mq.completed_request(rq, now);
|
||||
}
|
||||
|
||||
static inline void blk_mq_sched_started_request(struct request *rq)
|
||||
block/blk-mq-tag.c
@@ -232,13 +232,26 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
|
||||
|
||||
/*
|
||||
* We can hit rq == NULL here, because the tagging functions
|
||||
* test and set the bit before assining ->rqs[].
|
||||
* test and set the bit before assigning ->rqs[].
|
||||
*/
|
||||
if (rq && rq->q == hctx->queue)
|
||||
iter_data->fn(hctx, rq, iter_data->data, reserved);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* bt_for_each - iterate over the requests associated with a hardware queue
|
||||
* @hctx: Hardware queue to examine.
|
||||
* @bt: sbitmap to examine. This is either the breserved_tags member
|
||||
* or the bitmap_tags member of struct blk_mq_tags.
|
||||
* @fn: Pointer to the function that will be called for each request
|
||||
* associated with @hctx that has been assigned a driver tag.
|
||||
* @fn will be called as follows: @fn(@hctx, rq, @data, @reserved)
|
||||
* where rq is a pointer to a request.
|
||||
* @data: Will be passed as third argument to @fn.
|
||||
* @reserved: Indicates whether @bt is the breserved_tags member or the
|
||||
* bitmap_tags member of struct blk_mq_tags.
|
||||
*/
|
||||
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
|
||||
busy_iter_fn *fn, void *data, bool reserved)
|
||||
{
|
||||
@@ -280,6 +293,18 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* bt_tags_for_each - iterate over the requests in a tag map
|
||||
* @tags: Tag map to iterate over.
|
||||
* @bt: sbitmap to examine. This is either the breserved_tags member
|
||||
* or the bitmap_tags member of struct blk_mq_tags.
|
||||
* @fn: Pointer to the function that will be called for each started
|
||||
* request. @fn will be called as follows: @fn(rq, @data,
|
||||
* @reserved) where rq is a pointer to a request.
|
||||
* @data: Will be passed as second argument to @fn.
|
||||
* @reserved: Indicates whether @bt is the breserved_tags member or the
|
||||
* bitmap_tags member of struct blk_mq_tags.
|
||||
*/
|
||||
static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
|
||||
busy_tag_iter_fn *fn, void *data, bool reserved)
|
||||
{
|
||||
@@ -294,6 +319,15 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
|
||||
sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
|
||||
}
|
||||
|
||||
/**
|
||||
* blk_mq_all_tag_busy_iter - iterate over all started requests in a tag map
|
||||
* @tags: Tag map to iterate over.
|
||||
* @fn: Pointer to the function that will be called for each started
|
||||
* request. @fn will be called as follows: @fn(rq, @priv,
|
||||
* reserved) where rq is a pointer to a request. 'reserved'
|
||||
* indicates whether or not @rq is a reserved request.
|
||||
* @priv: Will be passed as second argument to @fn.
|
||||
*/
|
||||
static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
|
||||
busy_tag_iter_fn *fn, void *priv)
|
||||
{
|
||||
@@ -302,6 +336,15 @@ static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
|
||||
bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* blk_mq_tagset_busy_iter - iterate over all started requests in a tag set
|
||||
* @tagset: Tag set to iterate over.
|
||||
* @fn: Pointer to the function that will be called for each started
|
||||
* request. @fn will be called as follows: @fn(rq, @priv,
|
||||
* reserved) where rq is a pointer to a request. 'reserved'
|
||||
* indicates whether or not @rq is a reserved request.
|
||||
* @priv: Will be passed as second argument to @fn.
|
||||
*/
|
||||
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
|
||||
busy_tag_iter_fn *fn, void *priv)
|
||||
{
|
||||
@@ -314,6 +357,20 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
|
||||
}
|
||||
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
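
As a sketch of the iterator contract documented above, a driver could count its in-flight requests along the lines below; the example_* names are illustrative, and the callback shape follows the @fn description in the kernel-doc.

/* Illustrative only: count started requests across a tag set. */
static void example_count_busy(struct request *rq, void *data, bool reserved)
{
	unsigned int *count = data;

	(*count)++;
}

static unsigned int example_busy_requests(struct blk_mq_tag_set *set)
{
	unsigned int count = 0;

	blk_mq_tagset_busy_iter(set, example_count_busy, &count);
	return count;
}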
|
||||
|
||||
/**
|
||||
* blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
|
||||
* @q: Request queue to examine.
|
||||
* @fn: Pointer to the function that will be called for each request
|
||||
* on @q. @fn will be called as follows: @fn(hctx, rq, @priv,
|
||||
* reserved) where rq is a pointer to a request and hctx points
|
||||
* to the hardware queue associated with the request. 'reserved'
|
||||
* indicates whether or not @rq is a reserved request.
|
||||
* @priv: Will be passed as third argument to @fn.
|
||||
*
|
||||
* Note: if @q->tag_set is shared with other request queues then @fn will be
|
||||
* called for all requests on all queues that share that tag set and not only
|
||||
* for requests associated with @q.
|
||||
*/
|
||||
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
|
||||
void *priv)
|
||||
{
|
||||
@@ -321,9 +378,11 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
|
||||
int i;
|
||||
|
||||
/*
|
||||
* __blk_mq_update_nr_hw_queues will update the nr_hw_queues and
|
||||
* queue_hw_ctx after freeze the queue, so we use q_usage_counter
|
||||
* to avoid race with it.
|
||||
* __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
|
||||
* while the queue is frozen. So we can use q_usage_counter to avoid
|
||||
* racing with it. __blk_mq_update_nr_hw_queues() uses
|
||||
* synchronize_rcu() to ensure this function left the critical section
|
||||
* below.
|
||||
*/
|
||||
if (!percpu_ref_tryget(&q->q_usage_counter))
|
||||
return;
|
||||
@@ -332,7 +391,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
|
||||
struct blk_mq_tags *tags = hctx->tags;
|
||||
|
||||
/*
|
||||
* If not software queues are currently mapped to this
|
||||
* If no software queues are currently mapped to this
|
||||
* hardware queue, there's nothing to check
|
||||
*/
|
||||
if (!blk_mq_hw_queue_mapped(hctx))
|
||||
|
block/blk-mq.c
@@ -33,6 +33,7 @@
|
||||
#include "blk-mq.h"
|
||||
#include "blk-mq-debugfs.h"
|
||||
#include "blk-mq-tag.h"
|
||||
#include "blk-pm.h"
|
||||
#include "blk-stat.h"
|
||||
#include "blk-mq-sched.h"
|
||||
#include "blk-rq-qos.h"
|
||||
@@ -198,7 +199,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
|
||||
freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
|
||||
WARN_ON_ONCE(freeze_depth < 0);
|
||||
if (!freeze_depth) {
|
||||
percpu_ref_reinit(&q->q_usage_counter);
|
||||
percpu_ref_resurrect(&q->q_usage_counter);
|
||||
wake_up_all(&q->mq_freeze_wq);
|
||||
}
|
||||
}
|
||||
@@ -475,6 +476,7 @@ static void __blk_mq_free_request(struct request *rq)
|
||||
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
|
||||
const int sched_tag = rq->internal_tag;
|
||||
|
||||
blk_pm_mark_last_busy(rq);
|
||||
if (rq->tag != -1)
|
||||
blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
|
||||
if (sched_tag != -1)
|
||||
@@ -526,6 +528,9 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
|
||||
blk_stat_add(rq, now);
|
||||
}
|
||||
|
||||
if (rq->internal_tag != -1)
|
||||
blk_mq_sched_completed_request(rq, now);
|
||||
|
||||
blk_account_io_done(rq, now);
|
||||
|
||||
if (rq->end_io) {
|
||||
@@ -562,8 +567,20 @@ static void __blk_mq_complete_request(struct request *rq)
|
||||
|
||||
if (!blk_mq_mark_complete(rq))
|
||||
return;
|
||||
if (rq->internal_tag != -1)
|
||||
blk_mq_sched_completed_request(rq);
|
||||
|
||||
/*
|
||||
* Most of single queue controllers, there is only one irq vector
|
||||
* for handling IO completion, and the only irq's affinity is set
|
||||
* as all possible CPUs. On most of ARCHs, this affinity means the
|
||||
* irq is handled on one specific CPU.
|
||||
*
|
||||
* So complete IO reqeust in softirq context in case of single queue
|
||||
* for not degrading IO performance by irqsoff latency.
|
||||
*/
|
||||
if (rq->q->nr_hw_queues == 1) {
|
||||
__blk_complete_request(rq);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
|
||||
rq->q->softirq_done_fn(rq);
|
||||
@@ -2137,8 +2154,6 @@ static void blk_mq_exit_hctx(struct request_queue *q,
|
||||
struct blk_mq_tag_set *set,
|
||||
struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
|
||||
{
|
||||
blk_mq_debugfs_unregister_hctx(hctx);
|
||||
|
||||
if (blk_mq_hw_queue_mapped(hctx))
|
||||
blk_mq_tag_idle(hctx);
|
||||
|
||||
@@ -2165,6 +2180,7 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
if (i == nr_queue)
|
||||
break;
|
||||
blk_mq_debugfs_unregister_hctx(hctx);
|
||||
blk_mq_exit_hctx(q, set, hctx, i);
|
||||
}
|
||||
}
|
||||
@@ -2194,12 +2210,12 @@ static int blk_mq_init_hctx(struct request_queue *q,
|
||||
* runtime
|
||||
*/
|
||||
hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
|
||||
GFP_KERNEL, node);
|
||||
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
|
||||
if (!hctx->ctxs)
|
||||
goto unregister_cpu_notifier;
|
||||
|
||||
if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL,
|
||||
node))
|
||||
if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
|
||||
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node))
|
||||
goto free_ctxs;
|
||||
|
||||
hctx->nr_ctx = 0;
|
||||
@@ -2212,7 +2228,8 @@ static int blk_mq_init_hctx(struct request_queue *q,
|
||||
set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
|
||||
goto free_bitmap;
|
||||
|
||||
hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
|
||||
hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
|
||||
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
|
||||
if (!hctx->fq)
|
||||
goto exit_hctx;
|
||||
|
||||
@@ -2222,8 +2239,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
|
||||
if (hctx->flags & BLK_MQ_F_BLOCKING)
|
||||
init_srcu_struct(hctx->srcu);
|
||||
|
||||
blk_mq_debugfs_register_hctx(q, hctx);
|
||||
|
||||
return 0;
|
||||
|
||||
free_fq:
|
||||
@@ -2492,6 +2507,39 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
|
||||
}
|
||||
EXPORT_SYMBOL(blk_mq_init_queue);
|
||||
|
||||
/*
|
||||
* Helper for setting up a queue with mq ops, given queue depth, and
|
||||
* the passed in mq ops flags.
|
||||
*/
|
||||
struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
|
||||
const struct blk_mq_ops *ops,
|
||||
unsigned int queue_depth,
|
||||
unsigned int set_flags)
|
||||
{
|
||||
struct request_queue *q;
|
||||
int ret;
|
||||
|
||||
memset(set, 0, sizeof(*set));
|
||||
set->ops = ops;
|
||||
set->nr_hw_queues = 1;
|
||||
set->queue_depth = queue_depth;
|
||||
set->numa_node = NUMA_NO_NODE;
|
||||
set->flags = set_flags;
|
||||
|
||||
ret = blk_mq_alloc_tag_set(set);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
|
||||
q = blk_mq_init_queue(set);
|
||||
if (IS_ERR(q)) {
|
||||
blk_mq_free_tag_set(set);
|
||||
return q;
|
||||
}
|
||||
|
||||
return q;
|
||||
}
|
||||
EXPORT_SYMBOL(blk_mq_init_sq_queue);
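
A converted single-queue driver would typically use the new helper roughly as sketched here; the "mydrv" names, struct mydrv_dev, queue depth and flags are illustrative rather than taken from any of the conversions in this pull. Note the helper zeroes and fills the tag set itself, so the caller only has to keep it alive.

/* Illustrative conversion sketch; "mydrv" is a made-up driver. */
static struct blk_mq_tag_set mydrv_tag_set;

static blk_status_t mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
				   const struct blk_mq_queue_data *bd)
{
	/* dispatch bd->rq to the hardware here */
	return BLK_STS_OK;
}

static const struct blk_mq_ops mydrv_mq_ops = {
	.queue_rq	= mydrv_queue_rq,
};

static int mydrv_alloc_queue(struct mydrv_dev *dev)
{
	struct request_queue *q;

	q = blk_mq_init_sq_queue(&mydrv_tag_set, &mydrv_mq_ops, 16,
				 BLK_MQ_F_SHOULD_MERGE);
	if (IS_ERR(q))
		return PTR_ERR(q);

	q->queuedata = dev;
	dev->queue = q;
	return 0;
}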
|
||||
|
||||
static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
|
||||
{
|
||||
int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
|
||||
@@ -2506,48 +2554,90 @@ static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
|
||||
return hw_ctx_size;
|
||||
}
|
||||
|
||||
static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
|
||||
struct blk_mq_tag_set *set, struct request_queue *q,
|
||||
int hctx_idx, int node)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
|
||||
hctx = kzalloc_node(blk_mq_hw_ctx_size(set),
|
||||
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
|
||||
node);
|
||||
if (!hctx)
|
||||
return NULL;
|
||||
|
||||
if (!zalloc_cpumask_var_node(&hctx->cpumask,
|
||||
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
|
||||
node)) {
|
||||
kfree(hctx);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
atomic_set(&hctx->nr_active, 0);
|
||||
hctx->numa_node = node;
|
||||
hctx->queue_num = hctx_idx;
|
||||
|
||||
if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) {
|
||||
free_cpumask_var(hctx->cpumask);
|
||||
kfree(hctx);
|
||||
return NULL;
|
||||
}
|
||||
blk_mq_hctx_kobj_init(hctx);
|
||||
|
||||
return hctx;
|
||||
}
|
||||
|
||||
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
|
||||
struct request_queue *q)
|
||||
{
|
||||
int i, j;
|
||||
int i, j, end;
|
||||
struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
|
||||
|
||||
blk_mq_sysfs_unregister(q);
|
||||
|
||||
/* protect against switching io scheduler */
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
for (i = 0; i < set->nr_hw_queues; i++) {
|
||||
int node;
|
||||
|
||||
if (hctxs[i])
|
||||
continue;
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
|
||||
node = blk_mq_hw_queue_to_node(q->mq_map, i);
|
||||
hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
|
||||
GFP_KERNEL, node);
|
||||
if (!hctxs[i])
|
||||
break;
|
||||
/*
|
||||
* If the hw queue has been mapped to another numa node,
|
||||
* we need to realloc the hctx. If allocation fails, fallback
|
||||
* to use the previous one.
|
||||
*/
|
||||
if (hctxs[i] && (hctxs[i]->numa_node == node))
|
||||
continue;
|
||||
|
||||
if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
|
||||
node)) {
|
||||
kfree(hctxs[i]);
|
||||
hctxs[i] = NULL;
|
||||
break;
|
||||
hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
|
||||
if (hctx) {
|
||||
if (hctxs[i]) {
|
||||
blk_mq_exit_hctx(q, set, hctxs[i], i);
|
||||
kobject_put(&hctxs[i]->kobj);
|
||||
}
|
||||
hctxs[i] = hctx;
|
||||
} else {
|
||||
if (hctxs[i])
|
||||
pr_warn("Allocate new hctx on node %d fails,\
|
||||
fallback to previous one on node %d\n",
|
||||
node, hctxs[i]->numa_node);
|
||||
else
|
||||
break;
|
||||
}
|
||||
|
||||
atomic_set(&hctxs[i]->nr_active, 0);
|
||||
hctxs[i]->numa_node = node;
|
||||
hctxs[i]->queue_num = i;
|
||||
|
||||
if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
|
||||
free_cpumask_var(hctxs[i]->cpumask);
|
||||
kfree(hctxs[i]);
|
||||
hctxs[i] = NULL;
|
||||
break;
|
||||
}
|
||||
blk_mq_hctx_kobj_init(hctxs[i]);
|
||||
}
|
||||
for (j = i; j < q->nr_hw_queues; j++) {
|
||||
/*
|
||||
* Increasing nr_hw_queues fails. Free the newly allocated
|
||||
* hctxs and keep the previous q->nr_hw_queues.
|
||||
*/
|
||||
if (i != set->nr_hw_queues) {
|
||||
j = q->nr_hw_queues;
|
||||
end = i;
|
||||
} else {
|
||||
j = i;
|
||||
end = q->nr_hw_queues;
|
||||
q->nr_hw_queues = set->nr_hw_queues;
|
||||
}
|
||||
|
||||
for (; j < end; j++) {
|
||||
struct blk_mq_hw_ctx *hctx = hctxs[j];
|
||||
|
||||
if (hctx) {
|
||||
@@ -2559,9 +2649,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
|
||||
|
||||
}
|
||||
}
|
||||
q->nr_hw_queues = i;
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
blk_mq_sysfs_register(q);
|
||||
}
|
||||
|
||||
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
|
||||
@@ -2659,25 +2747,6 @@ void blk_mq_free_queue(struct request_queue *q)
|
||||
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
|
||||
}
|
||||
|
||||
/* Basically redo blk_mq_init_queue with queue frozen */
|
||||
static void blk_mq_queue_reinit(struct request_queue *q)
|
||||
{
|
||||
WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
|
||||
|
||||
blk_mq_debugfs_unregister_hctxs(q);
|
||||
blk_mq_sysfs_unregister(q);
|
||||
|
||||
/*
|
||||
* redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
|
||||
* we should change hctx numa_node according to the new topology (this
|
||||
* involves freeing and re-allocating memory, worth doing?)
|
||||
*/
|
||||
blk_mq_map_swqueue(q);
|
||||
|
||||
blk_mq_sysfs_register(q);
|
||||
blk_mq_debugfs_register_hctxs(q);
|
||||
}
|
||||
|
||||
static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
|
||||
{
|
||||
int i;
|
||||
@@ -2964,6 +3033,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
|
||||
{
|
||||
struct request_queue *q;
|
||||
LIST_HEAD(head);
|
||||
int prev_nr_hw_queues;
|
||||
|
||||
lockdep_assert_held(&set->tag_list_lock);
|
||||
|
||||
@@ -2987,11 +3057,30 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
|
||||
if (!blk_mq_elv_switch_none(&head, q))
|
||||
goto switch_back;
|
||||
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list) {
|
||||
blk_mq_debugfs_unregister_hctxs(q);
|
||||
blk_mq_sysfs_unregister(q);
|
||||
}
|
||||
|
||||
prev_nr_hw_queues = set->nr_hw_queues;
|
||||
set->nr_hw_queues = nr_hw_queues;
|
||||
blk_mq_update_queue_map(set);
|
||||
fallback:
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list) {
|
||||
blk_mq_realloc_hw_ctxs(set, q);
|
||||
blk_mq_queue_reinit(q);
|
||||
if (q->nr_hw_queues != set->nr_hw_queues) {
|
||||
pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
|
||||
nr_hw_queues, prev_nr_hw_queues);
|
||||
set->nr_hw_queues = prev_nr_hw_queues;
|
||||
blk_mq_map_queues(set);
|
||||
goto fallback;
|
||||
}
|
||||
blk_mq_map_swqueue(q);
|
||||
}
|
||||
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list) {
|
||||
blk_mq_sysfs_register(q);
|
||||
blk_mq_debugfs_register_hctxs(q);
|
||||
}
|
||||
|
||||
switch_back:
|
||||
|
block/blk-pm.c (new file)
@@ -0,0 +1,216 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include <linux/blk-mq.h>
|
||||
#include <linux/blk-pm.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/pm_runtime.h>
|
||||
#include "blk-mq.h"
|
||||
#include "blk-mq-tag.h"
|
||||
|
||||
/**
|
||||
* blk_pm_runtime_init - Block layer runtime PM initialization routine
|
||||
* @q: the queue of the device
|
||||
* @dev: the device the queue belongs to
|
||||
*
|
||||
* Description:
|
||||
* Initialize runtime-PM-related fields for @q and start auto suspend for
|
||||
* @dev. Drivers that want to take advantage of request-based runtime PM
|
||||
* should call this function after @dev has been initialized, and its
|
||||
* request queue @q has been allocated, and runtime PM for it can not happen
|
||||
* yet(either due to disabled/forbidden or its usage_count > 0). In most
|
||||
* cases, driver should call this function before any I/O has taken place.
|
||||
*
|
||||
* This function takes care of setting up using auto suspend for the device,
|
||||
* the autosuspend delay is set to -1 to make runtime suspend impossible
|
||||
* until an updated value is either set by user or by driver. Drivers do
|
||||
* not need to touch other autosuspend settings.
|
||||
*
|
||||
* The block layer runtime PM is request based, so only works for drivers
|
||||
* that use request as their IO unit instead of those directly use bio's.
|
||||
*/
|
||||
void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
|
||||
{
|
||||
q->dev = dev;
|
||||
q->rpm_status = RPM_ACTIVE;
|
||||
pm_runtime_set_autosuspend_delay(q->dev, -1);
|
||||
pm_runtime_use_autosuspend(q->dev);
|
||||
}
|
||||
EXPORT_SYMBOL(blk_pm_runtime_init);
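
A sketch of the probe-time hook-up the description above implies for a request-based driver; the mydrv names, the 5000 ms delay and the pm_runtime_allow() call are choices made by this example, not requirements of the API.

/* Illustrative probe-time setup; "mydrv" names are placeholders. */
static void mydrv_setup_runtime_pm(struct mydrv_dev *mydev)
{
	struct device *dev = mydev->dev;	/* assumed parent device */

	blk_pm_runtime_init(mydev->queue, dev);

	/* blk_pm_runtime_init() leaves the autosuspend delay at -1, so
	 * suspend stays impossible until a real delay is chosen here or
	 * by userspace. */
	pm_runtime_set_autosuspend_delay(dev, 5000);
	pm_runtime_allow(dev);
}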
|
||||
|
||||
/**
|
||||
* blk_pre_runtime_suspend - Pre runtime suspend check
|
||||
* @q: the queue of the device
|
||||
*
|
||||
* Description:
|
||||
* This function will check if runtime suspend is allowed for the device
|
||||
* by examining if there are any requests pending in the queue. If there
|
||||
* are requests pending, the device can not be runtime suspended; otherwise,
|
||||
* the queue's status will be updated to SUSPENDING and the driver can
|
||||
* proceed to suspend the device.
|
||||
*
|
||||
* For the not allowed case, we mark last busy for the device so that
|
||||
* runtime PM core will try to autosuspend it some time later.
|
||||
*
|
||||
* This function should be called near the start of the device's
|
||||
* runtime_suspend callback.
|
||||
*
|
||||
* Return:
|
||||
* 0 - OK to runtime suspend the device
|
||||
* -EBUSY - Device should not be runtime suspended
|
||||
*/
|
||||
int blk_pre_runtime_suspend(struct request_queue *q)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
if (!q->dev)
|
||||
return ret;
|
||||
|
||||
WARN_ON_ONCE(q->rpm_status != RPM_ACTIVE);
|
||||
|
||||
/*
|
||||
* Increase the pm_only counter before checking whether any
|
||||
* non-PM blk_queue_enter() calls are in progress to avoid that any
|
||||
* new non-PM blk_queue_enter() calls succeed before the pm_only
|
||||
* counter is decreased again.
|
||||
*/
|
||||
blk_set_pm_only(q);
|
||||
ret = -EBUSY;
|
||||
/* Switch q_usage_counter from per-cpu to atomic mode. */
|
||||
blk_freeze_queue_start(q);
|
||||
/*
|
||||
* Wait until atomic mode has been reached. Since that
|
||||
* involves calling call_rcu(), it is guaranteed that later
|
||||
* blk_queue_enter() calls see the pm-only state. See also
|
||||
* http://lwn.net/Articles/573497/.
|
||||
*/
|
||||
percpu_ref_switch_to_atomic_sync(&q->q_usage_counter);
|
||||
if (percpu_ref_is_zero(&q->q_usage_counter))
|
||||
ret = 0;
|
||||
/* Switch q_usage_counter back to per-cpu mode. */
|
||||
blk_mq_unfreeze_queue(q);
|
||||
|
||||
spin_lock_irq(q->queue_lock);
|
||||
if (ret < 0)
|
||||
pm_runtime_mark_last_busy(q->dev);
|
||||
else
|
||||
q->rpm_status = RPM_SUSPENDING;
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
|
||||
if (ret)
|
||||
blk_clear_pm_only(q);
|
||||
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(blk_pre_runtime_suspend);
|
||||
|
||||
/**
|
||||
* blk_post_runtime_suspend - Post runtime suspend processing
|
||||
* @q: the queue of the device
|
||||
* @err: return value of the device's runtime_suspend function
|
||||
*
|
||||
* Description:
|
||||
* Update the queue's runtime status according to the return value of the
|
||||
* device's runtime suspend function and mark last busy for the device so
|
||||
* that PM core will try to auto suspend the device at a later time.
|
||||
*
|
||||
* This function should be called near the end of the device's
|
||||
* runtime_suspend callback.
|
||||
*/
|
||||
void blk_post_runtime_suspend(struct request_queue *q, int err)
|
||||
{
|
||||
if (!q->dev)
|
||||
return;
|
||||
|
||||
spin_lock_irq(q->queue_lock);
|
||||
if (!err) {
|
||||
q->rpm_status = RPM_SUSPENDED;
|
||||
} else {
|
||||
q->rpm_status = RPM_ACTIVE;
|
||||
pm_runtime_mark_last_busy(q->dev);
|
||||
}
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
|
||||
if (err)
|
||||
blk_clear_pm_only(q);
|
||||
}
|
||||
EXPORT_SYMBOL(blk_post_runtime_suspend);
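A sketch of how this suspend pair is meant to be used from a driver's runtime_suspend callback; my_queue() and my_hw_suspend() are assumed helpers, and the includes are the same as in the probe sketch above.

static int my_runtime_suspend(struct device *dev)
{
	struct request_queue *q = my_queue(dev);	/* assumed accessor */
	int ret;

	ret = blk_pre_runtime_suspend(q);
	if (ret)
		return ret;	/* -EBUSY: requests pending, do not suspend */
	ret = my_hw_suspend(dev);	/* assumed hardware-specific suspend */
	blk_post_runtime_suspend(q, ret);
	return ret;
}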
|
||||
|
||||
/**
|
||||
* blk_pre_runtime_resume - Pre runtime resume processing
|
||||
* @q: the queue of the device
|
||||
*
|
||||
* Description:
|
||||
* Update the queue's runtime status to RESUMING in preparation for the
|
||||
* runtime resume of the device.
|
||||
*
|
||||
* This function should be called near the start of the device's
|
||||
* runtime_resume callback.
|
||||
*/
|
||||
void blk_pre_runtime_resume(struct request_queue *q)
|
||||
{
|
||||
if (!q->dev)
|
||||
return;
|
||||
|
||||
spin_lock_irq(q->queue_lock);
|
||||
q->rpm_status = RPM_RESUMING;
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
}
|
||||
EXPORT_SYMBOL(blk_pre_runtime_resume);
|
||||
|
||||
/**
|
||||
* blk_post_runtime_resume - Post runtime resume processing
|
||||
* @q: the queue of the device
|
||||
* @err: return value of the device's runtime_resume function
|
||||
*
|
||||
* Description:
|
||||
* Update the queue's runtime status according to the return value of the
|
||||
* device's runtime_resume function. If it is successfully resumed, process
|
||||
* the requests that are queued into the device's queue when it is resuming
|
||||
* and then mark last busy and initiate autosuspend for it.
|
||||
*
|
||||
* This function should be called near the end of the device's
|
||||
* runtime_resume callback.
|
||||
*/
|
||||
void blk_post_runtime_resume(struct request_queue *q, int err)
|
||||
{
|
||||
if (!q->dev)
|
||||
return;
|
||||
|
||||
spin_lock_irq(q->queue_lock);
|
||||
if (!err) {
|
||||
q->rpm_status = RPM_ACTIVE;
|
||||
pm_runtime_mark_last_busy(q->dev);
|
||||
pm_request_autosuspend(q->dev);
|
||||
} else {
|
||||
q->rpm_status = RPM_SUSPENDED;
|
||||
}
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
|
||||
if (!err)
|
||||
blk_clear_pm_only(q);
|
||||
}
|
||||
EXPORT_SYMBOL(blk_post_runtime_resume);
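The matching resume side, again as an assumed sketch; my_hw_resume() is hypothetical.

static int my_runtime_resume(struct device *dev)
{
	struct request_queue *q = my_queue(dev);	/* assumed accessor */
	int ret;

	blk_pre_runtime_resume(q);
	ret = my_hw_resume(dev);	/* assumed hardware-specific resume */
	blk_post_runtime_resume(q, ret);
	return ret;
}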
|
||||
|
||||
/**
|
||||
* blk_set_runtime_active - Force runtime status of the queue to be active
|
||||
* @q: the queue of the device
|
||||
*
|
||||
* If the device is left runtime suspended during system suspend the resume
|
||||
* hook typically resumes the device and corrects runtime status
|
||||
* accordingly. However, that does not affect the queue runtime PM status
|
||||
* which is still "suspended". This prevents processing requests from the
|
||||
* queue.
|
||||
*
|
||||
* This function can be used in driver's resume hook to correct queue
|
||||
* runtime PM status and re-enable peeking requests from the queue. It
|
||||
* should be called before first request is added to the queue.
|
||||
*/
|
||||
void blk_set_runtime_active(struct request_queue *q)
|
||||
{
|
||||
spin_lock_irq(q->queue_lock);
|
||||
q->rpm_status = RPM_ACTIVE;
|
||||
pm_runtime_mark_last_busy(q->dev);
|
||||
pm_request_autosuspend(q->dev);
|
||||
spin_unlock_irq(q->queue_lock);
|
||||
}
|
||||
EXPORT_SYMBOL(blk_set_runtime_active);
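And a sketch of the system-resume case the comment above describes, for a device that was left runtime suspended across system suspend; helpers are assumed as before.

static int my_resume(struct device *dev)
{
	struct request_queue *q = my_queue(dev);	/* assumed accessor */
	int ret;

	ret = my_hw_resume(dev);	/* assumed hardware-specific resume */
	if (ret)
		return ret;
	blk_set_runtime_active(q);	/* re-enable request processing */
	return 0;
}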
|
block/blk-pm.h (new file, 69 lines)
@@ -0,0 +1,69 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
|
||||
#ifndef _BLOCK_BLK_PM_H_
|
||||
#define _BLOCK_BLK_PM_H_
|
||||
|
||||
#include <linux/pm_runtime.h>
|
||||
|
||||
#ifdef CONFIG_PM
|
||||
static inline void blk_pm_request_resume(struct request_queue *q)
|
||||
{
|
||||
if (q->dev && (q->rpm_status == RPM_SUSPENDED ||
|
||||
q->rpm_status == RPM_SUSPENDING))
|
||||
pm_request_resume(q->dev);
|
||||
}
|
||||
|
||||
static inline void blk_pm_mark_last_busy(struct request *rq)
|
||||
{
|
||||
if (rq->q->dev && !(rq->rq_flags & RQF_PM))
|
||||
pm_runtime_mark_last_busy(rq->q->dev);
|
||||
}
|
||||
|
||||
static inline void blk_pm_requeue_request(struct request *rq)
|
||||
{
|
||||
lockdep_assert_held(rq->q->queue_lock);
|
||||
|
||||
if (rq->q->dev && !(rq->rq_flags & RQF_PM))
|
||||
rq->q->nr_pending--;
|
||||
}
|
||||
|
||||
static inline void blk_pm_add_request(struct request_queue *q,
|
||||
struct request *rq)
|
||||
{
|
||||
lockdep_assert_held(q->queue_lock);
|
||||
|
||||
if (q->dev && !(rq->rq_flags & RQF_PM))
|
||||
q->nr_pending++;
|
||||
}
|
||||
|
||||
static inline void blk_pm_put_request(struct request *rq)
|
||||
{
|
||||
lockdep_assert_held(rq->q->queue_lock);
|
||||
|
||||
if (rq->q->dev && !(rq->rq_flags & RQF_PM))
|
||||
--rq->q->nr_pending;
|
||||
}
|
||||
#else
|
||||
static inline void blk_pm_request_resume(struct request_queue *q)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void blk_pm_mark_last_busy(struct request *rq)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void blk_pm_requeue_request(struct request *rq)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void blk_pm_add_request(struct request_queue *q,
|
||||
struct request *rq)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void blk_pm_put_request(struct request *rq)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _BLOCK_BLK_PM_H_ */
|
@@ -97,8 +97,8 @@ static int blk_softirq_cpu_dead(unsigned int cpu)
|
||||
|
||||
void __blk_complete_request(struct request *req)
|
||||
{
|
||||
int ccpu, cpu;
|
||||
struct request_queue *q = req->q;
|
||||
int cpu, ccpu = q->mq_ops ? req->mq_ctx->cpu : req->cpu;
|
||||
unsigned long flags;
|
||||
bool shared = false;
|
||||
|
||||
@@ -110,8 +110,7 @@ void __blk_complete_request(struct request *req)
|
||||
/*
|
||||
* Select completion CPU
|
||||
*/
|
||||
if (req->cpu != -1) {
|
||||
ccpu = req->cpu;
|
||||
if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && ccpu != -1) {
|
||||
if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
|
||||
shared = cpus_share_cache(cpu, ccpu);
|
||||
} else
|
||||
|
@@ -190,6 +190,7 @@ void blk_stat_enable_accounting(struct request_queue *q)
|
||||
blk_queue_flag_set(QUEUE_FLAG_STATS, q);
|
||||
spin_unlock(&q->stats->lock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_stat_enable_accounting);
|
||||
|
||||
struct blk_queue_stats *blk_alloc_queue_stats(void)
|
||||
{
|
||||
|
@@ -84,8 +84,7 @@ struct throtl_service_queue {
|
||||
* RB tree of active children throtl_grp's, which are sorted by
|
||||
* their ->disptime.
|
||||
*/
|
||||
struct rb_root pending_tree; /* RB tree of active tgs */
|
||||
struct rb_node *first_pending; /* first node in the tree */
|
||||
struct rb_root_cached pending_tree; /* RB tree of active tgs */
|
||||
unsigned int nr_pending; /* # queued in the tree */
|
||||
unsigned long first_pending_disptime; /* disptime of the first tg */
|
||||
struct timer_list pending_timer; /* fires on first_pending_disptime */
|
||||
@@ -475,7 +474,7 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq)
|
||||
{
|
||||
INIT_LIST_HEAD(&sq->queued[0]);
|
||||
INIT_LIST_HEAD(&sq->queued[1]);
|
||||
sq->pending_tree = RB_ROOT;
|
||||
sq->pending_tree = RB_ROOT_CACHED;
|
||||
timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
|
||||
}
|
||||
|
||||
@@ -616,31 +615,23 @@ static void throtl_pd_free(struct blkg_policy_data *pd)
|
||||
static struct throtl_grp *
|
||||
throtl_rb_first(struct throtl_service_queue *parent_sq)
|
||||
{
|
||||
struct rb_node *n;
|
||||
/* Service tree is empty */
|
||||
if (!parent_sq->nr_pending)
|
||||
return NULL;
|
||||
|
||||
if (!parent_sq->first_pending)
|
||||
parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
|
||||
|
||||
if (parent_sq->first_pending)
|
||||
return rb_entry_tg(parent_sq->first_pending);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void rb_erase_init(struct rb_node *n, struct rb_root *root)
|
||||
{
|
||||
rb_erase(n, root);
|
||||
RB_CLEAR_NODE(n);
|
||||
n = rb_first_cached(&parent_sq->pending_tree);
|
||||
WARN_ON_ONCE(!n);
|
||||
if (!n)
|
||||
return NULL;
|
||||
return rb_entry_tg(n);
|
||||
}
|
||||
|
||||
static void throtl_rb_erase(struct rb_node *n,
|
||||
struct throtl_service_queue *parent_sq)
|
||||
{
|
||||
if (parent_sq->first_pending == n)
|
||||
parent_sq->first_pending = NULL;
|
||||
rb_erase_init(n, &parent_sq->pending_tree);
|
||||
rb_erase_cached(n, &parent_sq->pending_tree);
|
||||
RB_CLEAR_NODE(n);
|
||||
--parent_sq->nr_pending;
|
||||
}
|
||||
|
||||
@@ -658,11 +649,11 @@ static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
|
||||
static void tg_service_queue_add(struct throtl_grp *tg)
|
||||
{
|
||||
struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
|
||||
struct rb_node **node = &parent_sq->pending_tree.rb_node;
|
||||
struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
struct throtl_grp *__tg;
|
||||
unsigned long key = tg->disptime;
|
||||
int left = 1;
|
||||
bool leftmost = true;
|
||||
|
||||
while (*node != NULL) {
|
||||
parent = *node;
|
||||
@@ -672,15 +663,13 @@ static void tg_service_queue_add(struct throtl_grp *tg)
|
||||
node = &parent->rb_left;
|
||||
else {
|
||||
node = &parent->rb_right;
|
||||
left = 0;
|
||||
leftmost = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (left)
|
||||
parent_sq->first_pending = &tg->rb_node;
|
||||
|
||||
rb_link_node(&tg->rb_node, parent, node);
|
||||
rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
|
||||
rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree,
|
||||
leftmost);
|
||||
}
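For reference, a minimal standalone sketch of the rb_root_cached pattern these hunks switch to; struct item and its key are invented for the example, only the rbtree calls are the real API.

#include <linux/rbtree.h>
#include <linux/types.h>

struct item {
	struct rb_node node;
	unsigned long key;
};

/* Insert while letting the rbtree cache the leftmost (smallest-key) node. */
static void item_insert(struct rb_root_cached *root, struct item *ins)
{
	struct rb_node **link = &root->rb_root.rb_node, *parent = NULL;
	bool leftmost = true;

	while (*link) {
		struct item *cur = rb_entry(*link, struct item, node);

		parent = *link;
		if (ins->key < cur->key) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;
		}
	}
	rb_link_node(&ins->node, parent, link);
	rb_insert_color_cached(&ins->node, root, leftmost);
}

/* O(1) lookup of the smallest key, replacing the hand-rolled first_pending. */
static struct item *item_first(struct rb_root_cached *root)
{
	struct rb_node *n = rb_first_cached(root);

	return n ? rb_entry(n, struct item, node) : NULL;
}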
|
||||
|
||||
static void __throtl_enqueue_tg(struct throtl_grp *tg)
|
||||
@@ -2126,21 +2115,11 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
|
||||
}
|
||||
#endif
|
||||
|
||||
static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
|
||||
{
|
||||
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
|
||||
/* fallback to root_blkg if we fail to get a blkg ref */
|
||||
if (bio->bi_css && (bio_associate_blkg(bio, tg_to_blkg(tg)) == -ENODEV))
|
||||
bio_associate_blkg(bio, bio->bi_disk->queue->root_blkg);
|
||||
bio_issue_init(&bio->bi_issue, bio_sectors(bio));
|
||||
#endif
|
||||
}
|
||||
|
||||
bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
|
||||
struct bio *bio)
|
||||
{
|
||||
struct throtl_qnode *qn = NULL;
|
||||
struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
|
||||
struct throtl_grp *tg = blkg_to_tg(blkg);
|
||||
struct throtl_service_queue *sq;
|
||||
bool rw = bio_data_dir(bio);
|
||||
bool throttled = false;
|
||||
@@ -2159,7 +2138,6 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
|
||||
if (unlikely(blk_queue_bypass(q)))
|
||||
goto out_unlock;
|
||||
|
||||
blk_throtl_assoc_bio(tg, bio);
|
||||
blk_throtl_update_idletime(tg);
|
||||
|
||||
sq = &tg->service_queue;
|
||||
|
block/blk.h (73 lines changed)
@@ -4,6 +4,7 @@
|
||||
|
||||
#include <linux/idr.h>
|
||||
#include <linux/blk-mq.h>
|
||||
#include <xen/xen.h>
|
||||
#include "blk-mq.h"
|
||||
|
||||
/* Amount of time in which a process may batch requests */
|
||||
@@ -124,7 +125,7 @@ static inline void __blk_get_queue(struct request_queue *q)
|
||||
}
|
||||
|
||||
struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
|
||||
int node, int cmd_size);
|
||||
int node, int cmd_size, gfp_t flags);
|
||||
void blk_free_flush_queue(struct blk_flush_queue *q);
|
||||
|
||||
int blk_init_rl(struct request_list *rl, struct request_queue *q,
|
||||
@@ -149,6 +150,41 @@ static inline void blk_queue_enter_live(struct request_queue *q)
|
||||
percpu_ref_get(&q->q_usage_counter);
|
||||
}
|
||||
|
||||
static inline bool biovec_phys_mergeable(struct request_queue *q,
|
||||
struct bio_vec *vec1, struct bio_vec *vec2)
|
||||
{
|
||||
unsigned long mask = queue_segment_boundary(q);
|
||||
phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset;
|
||||
phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset;
|
||||
|
||||
if (addr1 + vec1->bv_len != addr2)
|
||||
return false;
|
||||
if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2))
|
||||
return false;
|
||||
if ((addr1 | mask) != ((addr2 + vec2->bv_len - 1) | mask))
|
||||
return false;
|
||||
return true;
|
||||
}
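The helper above folds three checks together: physical contiguity, Xen mergeability, and the segment-boundary mask. A standalone sketch of just the mask arithmetic, with an assumed 64 KiB boundary:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* True if the two ranges do not end up in the same segment window. */
static bool crosses_segment(uint64_t addr1, uint64_t addr2, uint64_t len2,
			    uint64_t mask)
{
	return (addr1 | mask) != ((addr2 + len2 - 1) | mask);
}

int main(void)
{
	uint64_t mask = 0xffff;	/* assumed 64 KiB segment boundary */

	/* Both ranges inside the first 64 KiB window: may merge. */
	printf("%d\n", crosses_segment(0x1000, 0x2000, 0x1000, mask));	/* 0 */
	/* Second range spills into the next window: may not merge. */
	printf("%d\n", crosses_segment(0xf000, 0x10000, 0x1000, mask));	/* 1 */
	return 0;
}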
|
||||
|
||||
static inline bool __bvec_gap_to_prev(struct request_queue *q,
|
||||
struct bio_vec *bprv, unsigned int offset)
|
||||
{
|
||||
return offset ||
|
||||
((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if adding a bio_vec after bprv with offset would create a gap in
|
||||
* the SG list. Most drivers don't care about this, but some do.
|
||||
*/
|
||||
static inline bool bvec_gap_to_prev(struct request_queue *q,
|
||||
struct bio_vec *bprv, unsigned int offset)
|
||||
{
|
||||
if (!queue_virt_boundary(q))
|
||||
return false;
|
||||
return __bvec_gap_to_prev(q, bprv, offset);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_INTEGRITY
|
||||
void blk_flush_integrity(void);
|
||||
bool __bio_integrity_endio(struct bio *);
|
||||
@@ -158,7 +194,38 @@ static inline bool bio_integrity_endio(struct bio *bio)
|
||||
return __bio_integrity_endio(bio);
|
||||
return true;
|
||||
}
|
||||
#else
|
||||
|
||||
static inline bool integrity_req_gap_back_merge(struct request *req,
|
||||
struct bio *next)
|
||||
{
|
||||
struct bio_integrity_payload *bip = bio_integrity(req->bio);
|
||||
struct bio_integrity_payload *bip_next = bio_integrity(next);
|
||||
|
||||
return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
|
||||
bip_next->bip_vec[0].bv_offset);
|
||||
}
|
||||
|
||||
static inline bool integrity_req_gap_front_merge(struct request *req,
|
||||
struct bio *bio)
|
||||
{
|
||||
struct bio_integrity_payload *bip = bio_integrity(bio);
|
||||
struct bio_integrity_payload *bip_next = bio_integrity(req->bio);
|
||||
|
||||
return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
|
||||
bip_next->bip_vec[0].bv_offset);
|
||||
}
|
||||
#else /* CONFIG_BLK_DEV_INTEGRITY */
|
||||
static inline bool integrity_req_gap_back_merge(struct request *req,
|
||||
struct bio *next)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
static inline bool integrity_req_gap_front_merge(struct request *req,
|
||||
struct bio *bio)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void blk_flush_integrity(void)
|
||||
{
|
||||
}
|
||||
@@ -166,7 +233,7 @@ static inline bool bio_integrity_endio(struct bio *bio)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
#endif /* CONFIG_BLK_DEV_INTEGRITY */
|
||||
|
||||
void blk_timeout_work(struct work_struct *work);
|
||||
unsigned long blk_rq_timeout(unsigned long timeout);
|
||||
|
@@ -31,6 +31,24 @@
|
||||
static struct bio_set bounce_bio_set, bounce_bio_split;
|
||||
static mempool_t page_pool, isa_page_pool;
|
||||
|
||||
static void init_bounce_bioset(void)
|
||||
{
|
||||
static bool bounce_bs_setup;
|
||||
int ret;
|
||||
|
||||
if (bounce_bs_setup)
|
||||
return;
|
||||
|
||||
ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
|
||||
BUG_ON(ret);
|
||||
if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE))
|
||||
BUG_ON(1);
|
||||
|
||||
ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0);
|
||||
BUG_ON(ret);
|
||||
bounce_bs_setup = true;
|
||||
}
|
||||
|
||||
#if defined(CONFIG_HIGHMEM)
|
||||
static __init int init_emergency_pool(void)
|
||||
{
|
||||
@@ -44,14 +62,7 @@ static __init int init_emergency_pool(void)
|
||||
BUG_ON(ret);
|
||||
pr_info("pool size: %d pages\n", POOL_SIZE);
|
||||
|
||||
ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
|
||||
BUG_ON(ret);
|
||||
if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE))
|
||||
BUG_ON(1);
|
||||
|
||||
ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0);
|
||||
BUG_ON(ret);
|
||||
|
||||
init_bounce_bioset();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -86,6 +97,8 @@ static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
|
||||
return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
|
||||
}
|
||||
|
||||
static DEFINE_MUTEX(isa_mutex);
|
||||
|
||||
/*
|
||||
* gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
|
||||
* as the max address, so check if the pool has already been created.
|
||||
@@ -94,14 +107,20 @@ int init_emergency_isa_pool(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (mempool_initialized(&isa_page_pool))
|
||||
mutex_lock(&isa_mutex);
|
||||
|
||||
if (mempool_initialized(&isa_page_pool)) {
|
||||
mutex_unlock(&isa_mutex);
|
||||
return 0;
|
||||
}
|
||||
|
||||
ret = mempool_init(&isa_page_pool, ISA_POOL_SIZE, mempool_alloc_pages_isa,
|
||||
mempool_free_pages, (void *) 0);
|
||||
BUG_ON(ret);
|
||||
|
||||
pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE);
|
||||
init_bounce_bioset();
|
||||
mutex_unlock(&isa_mutex);
|
||||
return 0;
|
||||
}
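The hunk above serializes the one-time ISA pool setup behind isa_mutex. The same check-inside-the-lock pattern in isolation, with the setup body elided:

#include <linux/mutex.h>
#include <linux/types.h>

static DEFINE_MUTEX(init_mutex);
static bool initialized;

static int init_once(void)
{
	mutex_lock(&init_mutex);
	if (initialized) {
		mutex_unlock(&init_mutex);
		return 0;
	}
	/* ... one-time setup goes here ... */
	initialized = true;
	mutex_unlock(&init_mutex);
	return 0;
}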
|
||||
|
||||
@@ -257,7 +276,9 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
|
||||
}
|
||||
}
|
||||
|
||||
bio_clone_blkcg_association(bio, bio_src);
|
||||
bio_clone_blkg_association(bio, bio_src);
|
||||
|
||||
blkcg_bio_issue_init(bio);
|
||||
|
||||
return bio;
|
||||
}
|
||||
|
@@ -1644,14 +1644,20 @@ static void cfq_pd_offline(struct blkg_policy_data *pd)
|
||||
int i;
|
||||
|
||||
for (i = 0; i < IOPRIO_BE_NR; i++) {
|
||||
if (cfqg->async_cfqq[0][i])
|
||||
if (cfqg->async_cfqq[0][i]) {
|
||||
cfq_put_queue(cfqg->async_cfqq[0][i]);
|
||||
if (cfqg->async_cfqq[1][i])
|
||||
cfqg->async_cfqq[0][i] = NULL;
|
||||
}
|
||||
if (cfqg->async_cfqq[1][i]) {
|
||||
cfq_put_queue(cfqg->async_cfqq[1][i]);
|
||||
cfqg->async_cfqq[1][i] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (cfqg->async_idle_cfqq)
|
||||
if (cfqg->async_idle_cfqq) {
|
||||
cfq_put_queue(cfqg->async_idle_cfqq);
|
||||
cfqg->async_idle_cfqq = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* @blkg is going offline and will be ignored by
|
||||
@@ -3753,7 +3759,7 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
|
||||
uint64_t serial_nr;
|
||||
|
||||
rcu_read_lock();
|
||||
serial_nr = bio_blkcg(bio)->css.serial_nr;
|
||||
serial_nr = __bio_blkcg(bio)->css.serial_nr;
|
||||
rcu_read_unlock();
|
||||
|
||||
/*
|
||||
@@ -3818,7 +3824,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
|
||||
struct cfq_group *cfqg;
|
||||
|
||||
rcu_read_lock();
|
||||
cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
|
||||
cfqg = cfq_lookup_cfqg(cfqd, __bio_blkcg(bio));
|
||||
if (!cfqg) {
|
||||
cfqq = &cfqd->oom_cfqq;
|
||||
goto out;
|
||||
|
@@ -41,6 +41,7 @@
|
||||
|
||||
#include "blk.h"
|
||||
#include "blk-mq-sched.h"
|
||||
#include "blk-pm.h"
|
||||
#include "blk-wbt.h"
|
||||
|
||||
static DEFINE_SPINLOCK(elv_list_lock);
|
||||
@@ -557,27 +558,6 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
|
||||
e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PM
|
||||
static void blk_pm_requeue_request(struct request *rq)
|
||||
{
|
||||
if (rq->q->dev && !(rq->rq_flags & RQF_PM))
|
||||
rq->q->nr_pending--;
|
||||
}
|
||||
|
||||
static void blk_pm_add_request(struct request_queue *q, struct request *rq)
|
||||
{
|
||||
if (q->dev && !(rq->rq_flags & RQF_PM) && q->nr_pending++ == 0 &&
|
||||
(q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING))
|
||||
pm_request_resume(q->dev);
|
||||
}
|
||||
#else
|
||||
static inline void blk_pm_requeue_request(struct request *rq) {}
|
||||
static inline void blk_pm_add_request(struct request_queue *q,
|
||||
struct request *rq)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
void elv_requeue_request(struct request_queue *q, struct request *rq)
|
||||
{
|
||||
/*
|
||||
|
@@ -567,7 +567,8 @@ static int exact_lock(dev_t devt, void *data)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void register_disk(struct device *parent, struct gendisk *disk)
|
||||
static void register_disk(struct device *parent, struct gendisk *disk,
|
||||
const struct attribute_group **groups)
|
||||
{
|
||||
struct device *ddev = disk_to_dev(disk);
|
||||
struct block_device *bdev;
|
||||
@@ -582,6 +583,10 @@ static void register_disk(struct device *parent, struct gendisk *disk)
|
||||
/* delay uevents, until we scanned partition table */
|
||||
dev_set_uevent_suppress(ddev, 1);
|
||||
|
||||
if (groups) {
|
||||
WARN_ON(ddev->groups);
|
||||
ddev->groups = groups;
|
||||
}
|
||||
if (device_add(ddev))
|
||||
return;
|
||||
if (!sysfs_deprecated) {
|
||||
@@ -647,6 +652,7 @@ exit:
|
||||
* __device_add_disk - add disk information to kernel list
|
||||
* @parent: parent device for the disk
|
||||
* @disk: per-device partitioning information
|
||||
* @groups: Additional per-device sysfs groups
|
||||
* @register_queue: register the queue if set to true
|
||||
*
|
||||
* This function registers the partitioning information in @disk
|
||||
@@ -655,6 +661,7 @@ exit:
|
||||
* FIXME: error handling
|
||||
*/
|
||||
static void __device_add_disk(struct device *parent, struct gendisk *disk,
|
||||
const struct attribute_group **groups,
|
||||
bool register_queue)
|
||||
{
|
||||
dev_t devt;
|
||||
@@ -698,7 +705,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
|
||||
blk_register_region(disk_devt(disk), disk->minors, NULL,
|
||||
exact_match, exact_lock, disk);
|
||||
}
|
||||
register_disk(parent, disk);
|
||||
register_disk(parent, disk, groups);
|
||||
if (register_queue)
|
||||
blk_register_queue(disk);
|
||||
|
||||
@@ -712,15 +719,17 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
|
||||
blk_integrity_add(disk);
|
||||
}
|
||||
|
||||
void device_add_disk(struct device *parent, struct gendisk *disk)
|
||||
void device_add_disk(struct device *parent, struct gendisk *disk,
|
||||
const struct attribute_group **groups)
|
||||
|
||||
{
|
||||
__device_add_disk(parent, disk, true);
|
||||
__device_add_disk(parent, disk, groups, true);
|
||||
}
|
||||
EXPORT_SYMBOL(device_add_disk);
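A hypothetical driver-side use of the new groups parameter; the my_disk_* names are assumptions, only the device_add_disk() signature comes from this patch.

#include <linux/device.h>
#include <linux/genhd.h>

static struct attribute *my_disk_attrs[] = {
	/* &dev_attr_foo.attr entries would go here */
	NULL,
};

static const struct attribute_group my_disk_group = {
	.attrs = my_disk_attrs,
};

static const struct attribute_group *my_disk_groups[] = {
	&my_disk_group,
	NULL,
};

static void my_add_disk(struct device *parent, struct gendisk *disk)
{
	/* Groups are registered together with the disk, before uevents fire. */
	device_add_disk(parent, disk, my_disk_groups);
}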
|
||||
|
||||
void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
|
||||
{
|
||||
__device_add_disk(parent, disk, false);
|
||||
__device_add_disk(parent, disk, NULL, false);
|
||||
}
|
||||
EXPORT_SYMBOL(device_add_disk_no_queue_reg);
|
||||
|
||||
|
@@ -29,19 +29,30 @@
|
||||
#include "blk-mq-debugfs.h"
|
||||
#include "blk-mq-sched.h"
|
||||
#include "blk-mq-tag.h"
|
||||
#include "blk-stat.h"
|
||||
|
||||
/* Scheduling domains. */
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/kyber.h>
|
||||
|
||||
/*
|
||||
* Scheduling domains: the device is divided into multiple domains based on the
|
||||
* request type.
|
||||
*/
|
||||
enum {
|
||||
KYBER_READ,
|
||||
KYBER_SYNC_WRITE,
|
||||
KYBER_OTHER, /* Async writes, discard, etc. */
|
||||
KYBER_WRITE,
|
||||
KYBER_DISCARD,
|
||||
KYBER_OTHER,
|
||||
KYBER_NUM_DOMAINS,
|
||||
};
|
||||
|
||||
enum {
|
||||
KYBER_MIN_DEPTH = 256,
|
||||
static const char *kyber_domain_names[] = {
|
||||
[KYBER_READ] = "READ",
|
||||
[KYBER_WRITE] = "WRITE",
|
||||
[KYBER_DISCARD] = "DISCARD",
|
||||
[KYBER_OTHER] = "OTHER",
|
||||
};
|
||||
|
||||
enum {
|
||||
/*
|
||||
* In order to prevent starvation of synchronous requests by a flood of
|
||||
* asynchronous requests, we reserve 25% of requests for synchronous
|
||||
@@ -51,25 +62,87 @@ enum {
|
||||
};
|
||||
|
||||
/*
|
||||
* Initial device-wide depths for each scheduling domain.
|
||||
* Maximum device-wide depth for each scheduling domain.
|
||||
*
|
||||
* Even for fast devices with lots of tags like NVMe, you can saturate
|
||||
* the device with only a fraction of the maximum possible queue depth.
|
||||
* So, we cap these to a reasonable value.
|
||||
* Even for fast devices with lots of tags like NVMe, you can saturate the
|
||||
* device with only a fraction of the maximum possible queue depth. So, we cap
|
||||
* these to a reasonable value.
|
||||
*/
|
||||
static const unsigned int kyber_depth[] = {
|
||||
[KYBER_READ] = 256,
|
||||
[KYBER_SYNC_WRITE] = 128,
|
||||
[KYBER_OTHER] = 64,
|
||||
[KYBER_WRITE] = 128,
|
||||
[KYBER_DISCARD] = 64,
|
||||
[KYBER_OTHER] = 16,
|
||||
};
|
||||
|
||||
/*
|
||||
* Scheduling domain batch sizes. We favor reads.
|
||||
* Default latency targets for each scheduling domain.
|
||||
*/
|
||||
static const u64 kyber_latency_targets[] = {
|
||||
[KYBER_READ] = 2ULL * NSEC_PER_MSEC,
|
||||
[KYBER_WRITE] = 10ULL * NSEC_PER_MSEC,
|
||||
[KYBER_DISCARD] = 5ULL * NSEC_PER_SEC,
|
||||
};
|
||||
|
||||
/*
|
||||
* Batch size (number of requests we'll dispatch in a row) for each scheduling
|
||||
* domain.
|
||||
*/
|
||||
static const unsigned int kyber_batch_size[] = {
|
||||
[KYBER_READ] = 16,
|
||||
[KYBER_SYNC_WRITE] = 8,
|
||||
[KYBER_OTHER] = 8,
|
||||
[KYBER_WRITE] = 8,
|
||||
[KYBER_DISCARD] = 1,
|
||||
[KYBER_OTHER] = 1,
|
||||
};
|
||||
|
||||
/*
|
||||
* Requests latencies are recorded in a histogram with buckets defined relative
|
||||
* to the target latency:
|
||||
*
|
||||
* <= 1/4 * target latency
|
||||
* <= 1/2 * target latency
|
||||
* <= 3/4 * target latency
|
||||
* <= target latency
|
||||
* <= 1 1/4 * target latency
|
||||
* <= 1 1/2 * target latency
|
||||
* <= 1 3/4 * target latency
|
||||
* > 1 3/4 * target latency
|
||||
*/
|
||||
enum {
|
||||
/*
|
||||
* The width of the latency histogram buckets is
|
||||
* 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
|
||||
*/
|
||||
KYBER_LATENCY_SHIFT = 2,
|
||||
/*
|
||||
* The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
|
||||
* thus, "good".
|
||||
*/
|
||||
KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
|
||||
/* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
|
||||
KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
|
||||
};
|
||||
|
||||
/*
|
||||
* We measure both the total latency and the I/O latency (i.e., latency after
|
||||
* submitting to the device).
|
||||
*/
|
||||
enum {
|
||||
KYBER_TOTAL_LATENCY,
|
||||
KYBER_IO_LATENCY,
|
||||
};
|
||||
|
||||
static const char *kyber_latency_type_names[] = {
|
||||
[KYBER_TOTAL_LATENCY] = "total",
|
||||
[KYBER_IO_LATENCY] = "I/O",
|
||||
};
|
||||
|
||||
/*
|
||||
* Per-cpu latency histograms: total latency and I/O latency for each scheduling
|
||||
* domain except for KYBER_OTHER.
|
||||
*/
|
||||
struct kyber_cpu_latency {
|
||||
atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
|
||||
};
|
||||
|
||||
/*
|
||||
@@ -88,12 +161,9 @@ struct kyber_ctx_queue {
|
||||
struct kyber_queue_data {
|
||||
struct request_queue *q;
|
||||
|
||||
struct blk_stat_callback *cb;
|
||||
|
||||
/*
|
||||
* The device is divided into multiple scheduling domains based on the
|
||||
* request type. Each domain has a fixed number of in-flight requests of
|
||||
* that type device-wide, limited by these tokens.
|
||||
* Each scheduling domain has a limited number of in-flight requests
|
||||
* device-wide, limited by these tokens.
|
||||
*/
|
||||
struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
|
||||
|
||||
@@ -103,8 +173,19 @@ struct kyber_queue_data {
|
||||
*/
|
||||
unsigned int async_depth;
|
||||
|
||||
struct kyber_cpu_latency __percpu *cpu_latency;
|
||||
|
||||
/* Timer for stats aggregation and adjusting domain tokens. */
|
||||
struct timer_list timer;
|
||||
|
||||
unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
|
||||
|
||||
unsigned long latency_timeout[KYBER_OTHER];
|
||||
|
||||
int domain_p99[KYBER_OTHER];
|
||||
|
||||
/* Target latencies in nanoseconds. */
|
||||
u64 read_lat_nsec, write_lat_nsec;
|
||||
u64 latency_targets[KYBER_OTHER];
|
||||
};
|
||||
|
||||
struct kyber_hctx_data {
|
||||
@@ -124,233 +205,219 @@ static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
|
||||
|
||||
static unsigned int kyber_sched_domain(unsigned int op)
|
||||
{
|
||||
if ((op & REQ_OP_MASK) == REQ_OP_READ)
|
||||
switch (op & REQ_OP_MASK) {
|
||||
case REQ_OP_READ:
|
||||
return KYBER_READ;
|
||||
else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
|
||||
return KYBER_SYNC_WRITE;
|
||||
else
|
||||
case REQ_OP_WRITE:
|
||||
return KYBER_WRITE;
|
||||
case REQ_OP_DISCARD:
|
||||
return KYBER_DISCARD;
|
||||
default:
|
||||
return KYBER_OTHER;
|
||||
}
|
||||
}
|
||||
|
||||
enum {
|
||||
NONE = 0,
|
||||
GOOD = 1,
|
||||
GREAT = 2,
|
||||
BAD = -1,
|
||||
AWFUL = -2,
|
||||
};
|
||||
|
||||
#define IS_GOOD(status) ((status) > 0)
|
||||
#define IS_BAD(status) ((status) < 0)
|
||||
|
||||
static int kyber_lat_status(struct blk_stat_callback *cb,
|
||||
unsigned int sched_domain, u64 target)
|
||||
static void flush_latency_buckets(struct kyber_queue_data *kqd,
|
||||
struct kyber_cpu_latency *cpu_latency,
|
||||
unsigned int sched_domain, unsigned int type)
|
||||
{
|
||||
u64 latency;
|
||||
unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
|
||||
atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type];
|
||||
unsigned int bucket;
|
||||
|
||||
if (!cb->stat[sched_domain].nr_samples)
|
||||
return NONE;
|
||||
|
||||
latency = cb->stat[sched_domain].mean;
|
||||
if (latency >= 2 * target)
|
||||
return AWFUL;
|
||||
else if (latency > target)
|
||||
return BAD;
|
||||
else if (latency <= target / 2)
|
||||
return GREAT;
|
||||
else /* (latency <= target) */
|
||||
return GOOD;
|
||||
for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
|
||||
buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Adjust the read or synchronous write depth given the status of reads and
|
||||
* writes. The goal is that the latencies of the two domains are fair (i.e., if
|
||||
* one is good, then the other is good).
|
||||
* Calculate the histogram bucket with the given percentile rank, or -1 if there
|
||||
* aren't enough samples yet.
|
||||
*/
|
||||
static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
|
||||
unsigned int sched_domain, int this_status,
|
||||
int other_status)
|
||||
static int calculate_percentile(struct kyber_queue_data *kqd,
|
||||
unsigned int sched_domain, unsigned int type,
|
||||
unsigned int percentile)
|
||||
{
|
||||
unsigned int orig_depth, depth;
|
||||
unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
|
||||
unsigned int bucket, samples = 0, percentile_samples;
|
||||
|
||||
for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
|
||||
samples += buckets[bucket];
|
||||
|
||||
if (!samples)
|
||||
return -1;
|
||||
|
||||
/*
|
||||
* If this domain had no samples, or reads and writes are both good or
|
||||
* both bad, don't adjust the depth.
|
||||
* We do the calculation once we have 500 samples or one second passes
|
||||
* since the first sample was recorded, whichever comes first.
|
||||
*/
|
||||
if (this_status == NONE ||
|
||||
(IS_GOOD(this_status) && IS_GOOD(other_status)) ||
|
||||
(IS_BAD(this_status) && IS_BAD(other_status)))
|
||||
return;
|
||||
|
||||
orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
|
||||
|
||||
if (other_status == NONE) {
|
||||
depth++;
|
||||
} else {
|
||||
switch (this_status) {
|
||||
case GOOD:
|
||||
if (other_status == AWFUL)
|
||||
depth -= max(depth / 4, 1U);
|
||||
else
|
||||
depth -= max(depth / 8, 1U);
|
||||
break;
|
||||
case GREAT:
|
||||
if (other_status == AWFUL)
|
||||
depth /= 2;
|
||||
else
|
||||
depth -= max(depth / 4, 1U);
|
||||
break;
|
||||
case BAD:
|
||||
depth++;
|
||||
break;
|
||||
case AWFUL:
|
||||
if (other_status == GREAT)
|
||||
depth += 2;
|
||||
else
|
||||
depth++;
|
||||
break;
|
||||
}
|
||||
if (!kqd->latency_timeout[sched_domain])
|
||||
kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL);
|
||||
if (samples < 500 &&
|
||||
time_is_after_jiffies(kqd->latency_timeout[sched_domain])) {
|
||||
return -1;
|
||||
}
|
||||
kqd->latency_timeout[sched_domain] = 0;
|
||||
|
||||
percentile_samples = DIV_ROUND_UP(samples * percentile, 100);
|
||||
for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
|
||||
if (buckets[bucket] >= percentile_samples)
|
||||
break;
|
||||
percentile_samples -= buckets[bucket];
|
||||
}
|
||||
memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));
|
||||
|
||||
trace_kyber_latency(kqd->q, kyber_domain_names[sched_domain],
|
||||
kyber_latency_type_names[type], percentile,
|
||||
bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples);
|
||||
|
||||
return bucket;
|
||||
}
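A standalone rendering of the percentile search above; the histogram contents are made up for the example.

#include <stdio.h>

#define NBUCKETS 8

/* Index of the bucket containing the given percentile rank, or -1 if empty. */
static int percentile_bucket(const unsigned int buckets[NBUCKETS],
			     unsigned int percentile)
{
	unsigned int samples = 0, want, i;

	for (i = 0; i < NBUCKETS; i++)
		samples += buckets[i];
	if (!samples)
		return -1;

	want = (samples * percentile + 99) / 100;	/* DIV_ROUND_UP */
	for (i = 0; i < NBUCKETS - 1; i++) {
		if (buckets[i] >= want)
			break;
		want -= buckets[i];
	}
	return i;
}

int main(void)
{
	unsigned int hist[NBUCKETS] = { 50, 30, 10, 5, 3, 1, 1, 0 };

	printf("p90 bucket = %d\n", percentile_bucket(hist, 90));	/* prints 2 */
	return 0;
}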
|
||||
|
||||
static void kyber_resize_domain(struct kyber_queue_data *kqd,
|
||||
unsigned int sched_domain, unsigned int depth)
|
||||
{
|
||||
depth = clamp(depth, 1U, kyber_depth[sched_domain]);
|
||||
if (depth != orig_depth)
|
||||
if (depth != kqd->domain_tokens[sched_domain].sb.depth) {
|
||||
sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
|
||||
trace_kyber_adjust(kqd->q, kyber_domain_names[sched_domain],
|
||||
depth);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Adjust the depth of other requests given the status of reads and synchronous
|
||||
* writes. As long as either domain is doing fine, we don't throttle, but if
|
||||
* both domains are doing badly, we throttle heavily.
|
||||
*/
|
||||
static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
|
||||
int read_status, int write_status,
|
||||
bool have_samples)
|
||||
static void kyber_timer_fn(struct timer_list *t)
|
||||
{
|
||||
unsigned int orig_depth, depth;
|
||||
int status;
|
||||
struct kyber_queue_data *kqd = from_timer(kqd, t, timer);
|
||||
unsigned int sched_domain;
|
||||
int cpu;
|
||||
bool bad = false;
|
||||
|
||||
orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
|
||||
/* Sum all of the per-cpu latency histograms. */
|
||||
for_each_online_cpu(cpu) {
|
||||
struct kyber_cpu_latency *cpu_latency;
|
||||
|
||||
if (read_status == NONE && write_status == NONE) {
|
||||
depth += 2;
|
||||
} else if (have_samples) {
|
||||
if (read_status == NONE)
|
||||
status = write_status;
|
||||
else if (write_status == NONE)
|
||||
status = read_status;
|
||||
else
|
||||
status = max(read_status, write_status);
|
||||
switch (status) {
|
||||
case GREAT:
|
||||
depth += 2;
|
||||
break;
|
||||
case GOOD:
|
||||
depth++;
|
||||
break;
|
||||
case BAD:
|
||||
depth -= max(depth / 4, 1U);
|
||||
break;
|
||||
case AWFUL:
|
||||
depth /= 2;
|
||||
break;
|
||||
cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu);
|
||||
for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
|
||||
flush_latency_buckets(kqd, cpu_latency, sched_domain,
|
||||
KYBER_TOTAL_LATENCY);
|
||||
flush_latency_buckets(kqd, cpu_latency, sched_domain,
|
||||
KYBER_IO_LATENCY);
|
||||
}
|
||||
}
|
||||
|
||||
depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
|
||||
if (depth != orig_depth)
|
||||
sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
|
||||
}
|
||||
/*
|
||||
* Check if any domains have a high I/O latency, which might indicate
|
||||
* congestion in the device. Note that we use the p90; we don't want to
|
||||
* be too sensitive to outliers here.
|
||||
*/
|
||||
for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
|
||||
int p90;
|
||||
|
||||
/*
|
||||
* Apply heuristics for limiting queue depths based on gathered latency
|
||||
* statistics.
|
||||
*/
|
||||
static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
|
||||
{
|
||||
struct kyber_queue_data *kqd = cb->data;
|
||||
int read_status, write_status;
|
||||
|
||||
read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
|
||||
write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
|
||||
|
||||
kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
|
||||
kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
|
||||
kyber_adjust_other_depth(kqd, read_status, write_status,
|
||||
cb->stat[KYBER_OTHER].nr_samples != 0);
|
||||
p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY,
|
||||
90);
|
||||
if (p90 >= KYBER_GOOD_BUCKETS)
|
||||
bad = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Continue monitoring latencies if we aren't hitting the targets or
|
||||
* we're still throttling other requests.
|
||||
* Adjust the scheduling domain depths. If we determined that there was
|
||||
* congestion, we throttle all domains with good latencies. Either way,
|
||||
* we ease up on throttling domains with bad latencies.
|
||||
*/
|
||||
if (!blk_stat_is_active(kqd->cb) &&
|
||||
((IS_BAD(read_status) || IS_BAD(write_status) ||
|
||||
kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
|
||||
blk_stat_activate_msecs(kqd->cb, 100);
|
||||
for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
|
||||
unsigned int orig_depth, depth;
|
||||
int p99;
|
||||
|
||||
p99 = calculate_percentile(kqd, sched_domain,
|
||||
KYBER_TOTAL_LATENCY, 99);
|
||||
/*
|
||||
* This is kind of subtle: different domains will not
|
||||
* necessarily have enough samples to calculate the latency
|
||||
* percentiles during the same window, so we have to remember
|
||||
* the p99 for the next time we observe congestion; once we do,
|
||||
* we don't want to throttle again until we get more data, so we
|
||||
* reset it to -1.
|
||||
*/
|
||||
if (bad) {
|
||||
if (p99 < 0)
|
||||
p99 = kqd->domain_p99[sched_domain];
|
||||
kqd->domain_p99[sched_domain] = -1;
|
||||
} else if (p99 >= 0) {
|
||||
kqd->domain_p99[sched_domain] = p99;
|
||||
}
|
||||
if (p99 < 0)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* If this domain has bad latency, throttle less. Otherwise,
|
||||
* throttle more iff we determined that there is congestion.
|
||||
*
|
||||
* The new depth is scaled linearly with the p99 latency vs the
|
||||
* latency target. E.g., if the p99 is 3/4 of the target, then
|
||||
* we throttle down to 3/4 of the current depth, and if the p99
|
||||
* is 2x the target, then we double the depth.
|
||||
*/
|
||||
if (bad || p99 >= KYBER_GOOD_BUCKETS) {
|
||||
orig_depth = kqd->domain_tokens[sched_domain].sb.depth;
|
||||
depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
|
||||
kyber_resize_domain(kqd, sched_domain, depth);
|
||||
}
|
||||
}
|
||||
}
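A quick numeric check of the depth-scaling formula above, using the bucket convention from the latency histogram; the depth and bucket values are assumed for illustration.

#include <stdio.h>

#define LATENCY_SHIFT 2

int main(void)
{
	unsigned int orig_depth = 64;

	/* p99 in bucket 2 (~3/4 of target): throttle down to 3/4 of the depth. */
	printf("%u\n", (orig_depth * (2 + 1)) >> LATENCY_SHIFT);	/* 48 */
	/* p99 in bucket 7 (~2x target): double the depth. */
	printf("%u\n", (orig_depth * (7 + 1)) >> LATENCY_SHIFT);	/* 128 */
	return 0;
}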
|
||||
|
||||
static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
|
||||
static unsigned int kyber_sched_tags_shift(struct request_queue *q)
|
||||
{
|
||||
/*
|
||||
* All of the hardware queues have the same depth, so we can just grab
|
||||
* the shift of the first one.
|
||||
*/
|
||||
return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
|
||||
}
|
||||
|
||||
static int kyber_bucket_fn(const struct request *rq)
|
||||
{
|
||||
return kyber_sched_domain(rq->cmd_flags);
|
||||
return q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
|
||||
}
|
||||
|
||||
static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
|
||||
{
|
||||
struct kyber_queue_data *kqd;
|
||||
unsigned int max_tokens;
|
||||
unsigned int shift;
|
||||
int ret = -ENOMEM;
|
||||
int i;
|
||||
|
||||
kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
|
||||
kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
|
||||
if (!kqd)
|
||||
goto err;
|
||||
|
||||
kqd->q = q;
|
||||
|
||||
kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
|
||||
KYBER_NUM_DOMAINS, kqd);
|
||||
if (!kqd->cb)
|
||||
kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
|
||||
GFP_KERNEL | __GFP_ZERO);
|
||||
if (!kqd->cpu_latency)
|
||||
goto err_kqd;
|
||||
|
||||
/*
|
||||
* The maximum number of tokens for any scheduling domain is at least
|
||||
* the queue depth of a single hardware queue. If the hardware doesn't
|
||||
* have many tags, still provide a reasonable number.
|
||||
*/
|
||||
max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
|
||||
KYBER_MIN_DEPTH);
|
||||
timer_setup(&kqd->timer, kyber_timer_fn, 0);
|
||||
|
||||
for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
|
||||
WARN_ON(!kyber_depth[i]);
|
||||
WARN_ON(!kyber_batch_size[i]);
|
||||
ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
|
||||
max_tokens, -1, false, GFP_KERNEL,
|
||||
q->node);
|
||||
kyber_depth[i], -1, false,
|
||||
GFP_KERNEL, q->node);
|
||||
if (ret) {
|
||||
while (--i >= 0)
|
||||
sbitmap_queue_free(&kqd->domain_tokens[i]);
|
||||
goto err_cb;
|
||||
goto err_buckets;
|
||||
}
|
||||
sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
|
||||
}
|
||||
|
||||
shift = kyber_sched_tags_shift(kqd);
|
||||
kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
|
||||
for (i = 0; i < KYBER_OTHER; i++) {
|
||||
kqd->domain_p99[i] = -1;
|
||||
kqd->latency_targets[i] = kyber_latency_targets[i];
|
||||
}
|
||||
|
||||
kqd->read_lat_nsec = 2000000ULL;
|
||||
kqd->write_lat_nsec = 10000000ULL;
|
||||
shift = kyber_sched_tags_shift(q);
|
||||
kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
|
||||
|
||||
return kqd;
|
||||
|
||||
err_cb:
|
||||
blk_stat_free_callback(kqd->cb);
|
||||
err_buckets:
|
||||
free_percpu(kqd->cpu_latency);
|
||||
err_kqd:
|
||||
kfree(kqd);
|
||||
err:
|
||||
@@ -372,25 +439,24 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
|
||||
return PTR_ERR(kqd);
|
||||
}
|
||||
|
||||
blk_stat_enable_accounting(q);
|
||||
|
||||
eq->elevator_data = kqd;
|
||||
q->elevator = eq;
|
||||
|
||||
blk_stat_add_callback(q, kqd->cb);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void kyber_exit_sched(struct elevator_queue *e)
|
||||
{
|
||||
struct kyber_queue_data *kqd = e->elevator_data;
|
||||
struct request_queue *q = kqd->q;
|
||||
int i;
|
||||
|
||||
blk_stat_remove_callback(q, kqd->cb);
|
||||
del_timer_sync(&kqd->timer);
|
||||
|
||||
for (i = 0; i < KYBER_NUM_DOMAINS; i++)
|
||||
sbitmap_queue_free(&kqd->domain_tokens[i]);
|
||||
blk_stat_free_callback(kqd->cb);
|
||||
free_percpu(kqd->cpu_latency);
|
||||
kfree(kqd);
|
||||
}
|
||||
|
||||
@@ -558,41 +624,44 @@ static void kyber_finish_request(struct request *rq)
|
||||
rq_clear_domain_token(kqd, rq);
|
||||
}
|
||||
|
||||
static void kyber_completed_request(struct request *rq)
|
||||
static void add_latency_sample(struct kyber_cpu_latency *cpu_latency,
|
||||
unsigned int sched_domain, unsigned int type,
|
||||
u64 target, u64 latency)
|
||||
{
|
||||
struct request_queue *q = rq->q;
|
||||
struct kyber_queue_data *kqd = q->elevator->elevator_data;
|
||||
unsigned int sched_domain;
|
||||
u64 now, latency, target;
|
||||
unsigned int bucket;
|
||||
u64 divisor;
|
||||
|
||||
/*
|
||||
* Check if this request met our latency goal. If not, quickly gather
|
||||
* some statistics and start throttling.
|
||||
*/
|
||||
sched_domain = kyber_sched_domain(rq->cmd_flags);
|
||||
switch (sched_domain) {
|
||||
case KYBER_READ:
|
||||
target = kqd->read_lat_nsec;
|
||||
break;
|
||||
case KYBER_SYNC_WRITE:
|
||||
target = kqd->write_lat_nsec;
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
if (latency > 0) {
|
||||
divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1);
|
||||
bucket = min_t(unsigned int, div64_u64(latency - 1, divisor),
|
||||
KYBER_LATENCY_BUCKETS - 1);
|
||||
} else {
|
||||
bucket = 0;
|
||||
}
|
||||
|
||||
/* If we are already monitoring latencies, don't check again. */
|
||||
if (blk_stat_is_active(kqd->cb))
|
||||
atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]);
|
||||
}
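A standalone sketch of the bucket mapping performed by add_latency_sample(), with a made-up target and made-up latencies.

#include <stdint.h>
#include <stdio.h>

#define LATENCY_SHIFT 2
#define LATENCY_BUCKETS (2 << LATENCY_SHIFT)	/* 8 buckets: 4 "good", 4 "bad" */

static unsigned int latency_bucket(uint64_t target, uint64_t latency)
{
	uint64_t divisor = (target >> LATENCY_SHIFT) ? (target >> LATENCY_SHIFT) : 1;
	uint64_t bucket;

	if (!latency)
		return 0;
	bucket = (latency - 1) / divisor;
	return bucket < LATENCY_BUCKETS - 1 ? bucket : LATENCY_BUCKETS - 1;
}

int main(void)
{
	uint64_t target = 2000000;	/* assumed 2 ms read target, in ns */

	/* 1.9 ms lands in a "good" bucket, 4.2 ms saturates the last "bad" one. */
	printf("%u %u\n", latency_bucket(target, 1900000),
	       latency_bucket(target, 4200000));	/* 3 7 */
	return 0;
}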
|
||||
|
||||
static void kyber_completed_request(struct request *rq, u64 now)
|
||||
{
|
||||
struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
|
||||
struct kyber_cpu_latency *cpu_latency;
|
||||
unsigned int sched_domain;
|
||||
u64 target;
|
||||
|
||||
sched_domain = kyber_sched_domain(rq->cmd_flags);
|
||||
if (sched_domain == KYBER_OTHER)
|
||||
return;
|
||||
|
||||
now = ktime_get_ns();
|
||||
if (now < rq->io_start_time_ns)
|
||||
return;
|
||||
cpu_latency = get_cpu_ptr(kqd->cpu_latency);
|
||||
target = kqd->latency_targets[sched_domain];
|
||||
add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY,
|
||||
target, now - rq->start_time_ns);
|
||||
add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target,
|
||||
now - rq->io_start_time_ns);
|
||||
put_cpu_ptr(kqd->cpu_latency);
|
||||
|
||||
latency = now - rq->io_start_time_ns;
|
||||
|
||||
if (latency > target)
|
||||
blk_stat_activate_msecs(kqd->cb, 10);
|
||||
timer_reduce(&kqd->timer, jiffies + HZ / 10);
|
||||
}
|
||||
|
||||
struct flush_kcq_data {
|
||||
@@ -713,6 +782,9 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
|
||||
rq_set_domain_token(rq, nr);
|
||||
list_del_init(&rq->queuelist);
|
||||
return rq;
|
||||
} else {
|
||||
trace_kyber_throttled(kqd->q,
|
||||
kyber_domain_names[khd->cur_domain]);
|
||||
}
|
||||
} else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
|
||||
nr = kyber_get_domain_token(kqd, khd, hctx);
|
||||
@@ -723,6 +795,9 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
|
||||
rq_set_domain_token(rq, nr);
|
||||
list_del_init(&rq->queuelist);
|
||||
return rq;
|
||||
} else {
|
||||
trace_kyber_throttled(kqd->q,
|
||||
kyber_domain_names[khd->cur_domain]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -790,17 +865,17 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
|
||||
return false;
|
||||
}
|
||||
|
||||
#define KYBER_LAT_SHOW_STORE(op) \
|
||||
static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \
|
||||
char *page) \
|
||||
#define KYBER_LAT_SHOW_STORE(domain, name) \
|
||||
static ssize_t kyber_##name##_lat_show(struct elevator_queue *e, \
|
||||
char *page) \
|
||||
{ \
|
||||
struct kyber_queue_data *kqd = e->elevator_data; \
|
||||
\
|
||||
return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \
|
||||
return sprintf(page, "%llu\n", kqd->latency_targets[domain]); \
|
||||
} \
|
||||
\
|
||||
static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
|
||||
const char *page, size_t count) \
|
||||
static ssize_t kyber_##name##_lat_store(struct elevator_queue *e, \
|
||||
const char *page, size_t count) \
|
||||
{ \
|
||||
struct kyber_queue_data *kqd = e->elevator_data; \
|
||||
unsigned long long nsec; \
|
||||
@@ -810,12 +885,12 @@ static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
|
||||
if (ret) \
|
||||
return ret; \
|
||||
\
|
||||
kqd->op##_lat_nsec = nsec; \
|
||||
kqd->latency_targets[domain] = nsec; \
|
||||
\
|
||||
return count; \
|
||||
}
|
||||
KYBER_LAT_SHOW_STORE(read);
|
||||
KYBER_LAT_SHOW_STORE(write);
|
||||
KYBER_LAT_SHOW_STORE(KYBER_READ, read);
|
||||
KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
|
||||
#undef KYBER_LAT_SHOW_STORE
|
||||
|
||||
#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
|
||||
@@ -882,7 +957,8 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \
|
||||
return 0; \
|
||||
}
|
||||
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
|
||||
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
|
||||
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write)
|
||||
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
|
||||
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
|
||||
#undef KYBER_DEBUGFS_DOMAIN_ATTRS
|
||||
|
||||
@@ -900,20 +976,7 @@ static int kyber_cur_domain_show(void *data, struct seq_file *m)
|
||||
struct blk_mq_hw_ctx *hctx = data;
|
||||
struct kyber_hctx_data *khd = hctx->sched_data;
|
||||
|
||||
switch (khd->cur_domain) {
|
||||
case KYBER_READ:
|
||||
seq_puts(m, "READ\n");
|
||||
break;
|
||||
case KYBER_SYNC_WRITE:
|
||||
seq_puts(m, "SYNC_WRITE\n");
|
||||
break;
|
||||
case KYBER_OTHER:
|
||||
seq_puts(m, "OTHER\n");
|
||||
break;
|
||||
default:
|
||||
seq_printf(m, "%u\n", khd->cur_domain);
|
||||
break;
|
||||
}
|
||||
seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -930,7 +993,8 @@ static int kyber_batching_show(void *data, struct seq_file *m)
|
||||
{#name "_tokens", 0400, kyber_##name##_tokens_show}
|
||||
static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
|
||||
KYBER_QUEUE_DOMAIN_ATTRS(read),
|
||||
KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
|
||||
KYBER_QUEUE_DOMAIN_ATTRS(write),
|
||||
KYBER_QUEUE_DOMAIN_ATTRS(discard),
|
||||
KYBER_QUEUE_DOMAIN_ATTRS(other),
|
||||
{"async_depth", 0400, kyber_async_depth_show},
|
||||
{},
|
||||
@@ -942,7 +1006,8 @@ static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
|
||||
{#name "_waiting", 0400, kyber_##name##_waiting_show}
|
||||
static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
|
||||
KYBER_HCTX_DOMAIN_ATTRS(read),
|
||||
KYBER_HCTX_DOMAIN_ATTRS(sync_write),
|
||||
KYBER_HCTX_DOMAIN_ATTRS(write),
|
||||
KYBER_HCTX_DOMAIN_ATTRS(discard),
|
||||
KYBER_HCTX_DOMAIN_ATTRS(other),
|
||||
{"cur_domain", 0400, kyber_cur_domain_show},
|
||||
{"batching", 0400, kyber_batching_show},
|
||||
|