Merge tag 'for-4.21/block-20181221' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
 "This is the main pull request for block/storage for 4.21. Larger than
  usual, it was a busy round with lots of goodies queued up. Most
  notable is the removal of the old IO stack, which has been a long
  time coming. No new features for a while, everything coming in this
  week has all been fixes for things that were previously merged.

  This contains:

   - Use atomic counters instead of semaphores for mtip32xx (Arnd)

   - Cleanup of the mtip32xx request setup (Christoph)

   - Fix for circular locking dependency in loop (Jan, Tetsuo)

   - bcache (Coly, Guoju, Shenghui)
      * Optimizations for writeback caching
      * Various fixes and improvements

   - nvme (Chaitanya, Christoph, Sagi, Jay, me, Keith)
      * host and target support for NVMe over TCP
      * Error log page support
      * Support for separate read/write/poll queues
      * Much improved polling
      * discard OOM fallback
      * Tracepoint improvements

   - lightnvm (Hans, Hua, Igor, Matias, Javier)
      * Igor added packed metadata to pblk. Now drives without metadata
        per LBA can be used as well.
      * Fix from Geert on uninitialized value on chunk metadata reads.
      * Fixes from Hans and Javier to pblk recovery and write path.
      * Fix from Hua Su to fix a race condition in the pblk recovery
        code.
      * Scan optimization added to pblk recovery from Zhoujie.
      * Small geometry cleanup from me.

   - Conversion of the last few drivers that used the legacy path to
     blk-mq (me)

   - Removal of legacy IO path in SCSI (me, Christoph)

   - Removal of legacy IO stack and schedulers (me)

   - Support for much better polling, now without interrupts at all.
     blk-mq adds support for multiple queue maps, which enables us to
     have a map per type. This in turn enables nvme to have separate
     completion queues for polling, which can then be interrupt-less.
     Also means we're ready for async polled IO, which is hopefully
     coming in the next release.

   - Killing of (now) unused block exports (Christoph)

   - Unification of the blk-rq-qos and blk-wbt wait handling (Josef)

   - Support for zoned testing with null_blk (Masato)

   - sx8 conversion to per-host tag sets (Christoph)

   - IO priority improvements (Damien)

   - mq-deadline zoned fix (Damien)

   - Ref count blkcg series (Dennis)

   - Lots of blk-mq improvements and speedups (me)

   - sbitmap scalability improvements (me)

   - Make core inflight IO accounting per-cpu (Mikulas)

   - Export timeout setting in sysfs (Weiping)

   - Cleanup the direct issue path (Jianchao)

   - Export blk-wbt internals in block debugfs for easier debugging
     (Ming)

   - Lots of other fixes and improvements"

* tag 'for-4.21/block-20181221' of git://git.kernel.dk/linux-block: (364 commits)
  kyber: use sbitmap add_wait_queue/list_del wait helpers
  sbitmap: add helpers for add/del wait queue handling
  block: save irq state in blkg_lookup_create()
  dm: don't reuse bio for flushes
  nvme-pci: trace SQ status on completions
  nvme-rdma: implement polling queue map
  nvme-fabrics: allow user to pass in nr_poll_queues
  nvme-fabrics: allow nvmf_connect_io_queue to poll
  nvme-core: optionally poll sync commands
  block: make request_to_qc_t public
  nvme-tcp: fix spelling mistake "attepmpt" -> "attempt"
  nvme-tcp: fix endianess annotations
  nvmet-tcp: fix endianess annotations
  nvme-pci: refactor nvme_poll_irqdisable to make sparse happy
  nvme-pci: only set nr_maps to 2 if poll queues are supported
  nvmet: use a macro for default error location
  nvmet: fix comparison of a u16 with -1
  blk-mq: enable IO poll if .nr_queues of type poll > 0
  blk-mq: change blk_mq_queue_busy() to blk_mq_queue_inflight()
  blk-mq: skip zero-queue maps in blk_mq_map_swqueue
  ...
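To illustrate the "multiple queue maps" idea described above, a minimal sketch of a driver-side map_queues callback follows; it is not taken from this pull request's diff. The demo_dev bookkeeping and queue counts are hypothetical, and the blk-mq names used (struct blk_mq_tag_set, struct blk_mq_queue_map, HCTX_TYPE_DEFAULT/READ/POLL, blk_mq_map_queues()) are assumed to match the interfaces merged in this cycle.

#include <linux/blk-mq.h>

/*
 * Hypothetical per-device bookkeeping (illustrative only): how many
 * hardware queues of each type this driver decided to allocate.
 */
struct demo_dev {
	unsigned int nr_default_queues;
	unsigned int nr_read_queues;
	unsigned int nr_poll_queues;
};

static int demo_map_queues(struct blk_mq_tag_set *set)
{
	struct demo_dev *dev = set->driver_data;
	unsigned int nr[HCTX_MAX_TYPES] = {
		[HCTX_TYPE_DEFAULT] = dev->nr_default_queues,
		[HCTX_TYPE_READ]    = dev->nr_read_queues,
		[HCTX_TYPE_POLL]    = dev->nr_poll_queues,
	};
	unsigned int type, offset = 0;

	for (type = 0; type < set->nr_maps; type++) {
		struct blk_mq_queue_map *map = &set->map[type];

		/* e.g. no dedicated read or poll queues were configured */
		map->nr_queues = nr[type];
		if (!map->nr_queues)
			continue;

		/*
		 * Hardware queues are carved out of one contiguous range,
		 * so each map starts where the previous one ended.
		 */
		map->queue_offset = offset;
		offset += map->nr_queues;

		/* Generic CPU-to-queue spread; poll queues take no IRQs. */
		blk_mq_map_queues(map);
	}
	return 0;
}

A real driver would typically spread its interrupt-driven maps by IRQ affinity as well, and, as the shortlog above notes for nvme-pci, only raise nr_maps when poll queues are actually configured.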
@@ -626,6 +626,20 @@ struct cache_set {
 	/* Where in the btree gc currently is */
 	struct bkey		gc_done;
 
+	/*
+	 * For automatic garbage collection after writeback completed, this
+	 * variable is used as bit fields:
+	 * - 0000 0001b (BCH_ENABLE_AUTO_GC): enable gc after writeback
+	 * - 0000 0010b (BCH_DO_AUTO_GC):     do gc after writeback
+	 * This is an optimization for the write requests that follow once
+	 * writeback has finished but the read hit rate has dropped because
+	 * clean data on the cache was discarded. Unless the user explicitly
+	 * sets it via sysfs, it won't be enabled.
+	 */
+#define BCH_ENABLE_AUTO_GC	1
+#define BCH_DO_AUTO_GC		2
+	uint8_t			gc_after_writeback;
+
 	/*
 	 * The allocation code needs gc_mark in struct bucket to be correct, but
 	 * it's not while a gc is in progress. Protected by bucket_lock.
@@ -658,7 +672,11 @@ struct cache_set {
 
 	/*
 	 * A btree node on disk could have too many bsets for an iterator to fit
-	 * on the stack - have to dynamically allocate them
+	 * on the stack - have to dynamically allocate them.
+	 * bch_cache_set_alloc() will make sure the pool can allocate iterators
+	 * equipped with enough room to host
+	 * (sb.bucket_size / sb.block_size)
+	 * btree_iter_sets, which is more than static MAX_BSETS.
 	 */
 	mempool_t		fill_iter;
 
@@ -207,6 +207,11 @@ void bch_btree_node_read_done(struct btree *b)
 	struct bset *i = btree_bset_first(b);
 	struct btree_iter *iter;
 
+	/*
+	 * c->fill_iter can allocate an iterator with more memory space
+	 * than static MAX_BSETS.
+	 * See the comment around cache_set->fill_iter.
+	 */
 	iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
 	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
 	iter->used = 0;
@@ -266,6 +266,24 @@ static inline void wake_up_gc(struct cache_set *c)
 	wake_up(&c->gc_wait);
 }
 
+static inline void force_wake_up_gc(struct cache_set *c)
+{
+	/*
+	 * Garbage collection thread only works when sectors_to_gc < 0,
+	 * calling wake_up_gc() won't start gc thread if sectors_to_gc is
+	 * not a negative value.
+	 * Therefore sectors_to_gc is set to -1 here, before waking up
+	 * gc thread by calling wake_up_gc(). Then gc_should_run() will
+	 * give a chance to permit gc thread to run. "Give a chance" means
+	 * before going into gc_should_run(), there is still a chance
+	 * that c->sectors_to_gc gets set to another positive value. So
+	 * this routine won't 100% make sure gc thread will be woken up
+	 * to run.
+	 */
+	atomic_set(&c->sectors_to_gc, -1);
+	wake_up_gc(c);
+}
+
 #define MAP_DONE	0
 #define MAP_CONTINUE	1
 
@@ -249,8 +249,7 @@ void bch_debug_init_cache_set(struct cache_set *c)
 
 void bch_debug_exit(void)
 {
-	if (!IS_ERR_OR_NULL(bcache_debug))
-		debugfs_remove_recursive(bcache_debug);
+	debugfs_remove_recursive(bcache_debug);
 }
 
 void __init bch_debug_init(void)
@@ -663,7 +663,7 @@ static void journal_write_unlocked(struct closure *cl)
 			 REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
 		bch_bio_map(bio, w->data);
 
-		trace_bcache_journal_write(bio);
+		trace_bcache_journal_write(bio, w->data->keys);
 		bio_list_add(&list, bio);
 
 		SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
@@ -311,11 +311,11 @@ err:
  * data is written it calls bch_journal, and after the keys have been added to
  * the next journal write they're inserted into the btree.
  *
- * It inserts the data in s->cache_bio; bi_sector is used for the key offset,
+ * It inserts the data in op->bio; bi_sector is used for the key offset,
  * and op->inode is used for the key inode.
  *
- * If s->bypass is true, instead of inserting the data it invalidates the
- * region of the cache represented by s->cache_bio and op->inode.
+ * If op->bypass is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->bio and op->inode.
  */
 void bch_data_insert(struct closure *cl)
 {
@@ -25,8 +25,8 @@
 #include <linux/reboot.h>
 #include <linux/sysfs.h>
 
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+unsigned int bch_cutoff_writeback;
+unsigned int bch_cutoff_writeback_sync;
 
 static const char bcache_magic[] = {
 	0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
@@ -1510,8 +1510,7 @@ static void cache_set_free(struct closure *cl)
 	struct cache *ca;
 	unsigned int i;
 
-	if (!IS_ERR_OR_NULL(c->debug))
-		debugfs_remove(c->debug);
+	debugfs_remove(c->debug);
 
 	bch_open_buckets_free(c);
 	bch_btree_cache_free(c);
@@ -2424,6 +2423,32 @@ static void bcache_exit(void)
 	mutex_destroy(&bch_register_lock);
 }
 
+/* Check and fixup module parameters */
+static void check_module_parameters(void)
+{
+	if (bch_cutoff_writeback_sync == 0)
+		bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
+	else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
+		pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u",
+			bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
+		bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
+	}
+
+	if (bch_cutoff_writeback == 0)
+		bch_cutoff_writeback = CUTOFF_WRITEBACK;
+	else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
+		pr_warn("set bch_cutoff_writeback (%u) to max value %u",
+			bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
+		bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
+	}
+
+	if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
+		pr_warn("set bch_cutoff_writeback (%u) to %u",
+			bch_cutoff_writeback, bch_cutoff_writeback_sync);
+		bch_cutoff_writeback = bch_cutoff_writeback_sync;
+	}
+}
+
 static int __init bcache_init(void)
 {
 	static const struct attribute *files[] = {
@@ -2432,6 +2457,8 @@ static int __init bcache_init(void)
 		NULL
 	};
 
+	check_module_parameters();
+
 	mutex_init(&bch_register_lock);
 	init_waitqueue_head(&unregister_wait);
 	register_reboot_notifier(&reboot);
@@ -2468,5 +2495,18 @@ err:
 	return -ENOMEM;
 }
 
+/*
+ * Module hooks
+ */
 module_exit(bcache_exit);
 module_init(bcache_init);
+
+module_param(bch_cutoff_writeback, uint, 0);
+MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");
+
+module_param(bch_cutoff_writeback_sync, uint, 0);
+MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");
+
+MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+MODULE_LICENSE("GPL");
@@ -16,7 +16,7 @@
 #include <linux/sort.h>
 #include <linux/sched/clock.h>
 
-/* Default is -1; we skip past it for struct cached_dev's cache mode */
+/* Default is 0 ("writethrough") */
 static const char * const bch_cache_modes[] = {
 	"writethrough",
 	"writeback",
@@ -25,7 +25,7 @@ static const char * const bch_cache_modes[] = {
 	NULL
 };
 
-/* Default is -1; we skip past it for stop_when_cache_set_failed */
+/* Default is 0 ("auto") */
 static const char * const bch_stop_on_failure_modes[] = {
 	"auto",
 	"always",
@@ -88,6 +88,8 @@ read_attribute(writeback_keys_done);
 read_attribute(writeback_keys_failed);
 read_attribute(io_errors);
 read_attribute(congested);
+read_attribute(cutoff_writeback);
+read_attribute(cutoff_writeback_sync);
 rw_attribute(congested_read_threshold_us);
 rw_attribute(congested_write_threshold_us);
 
@@ -128,6 +130,7 @@ rw_attribute(expensive_debug_checks);
 rw_attribute(cache_replacement_policy);
 rw_attribute(btree_shrinker_disabled);
 rw_attribute(copy_gc_enabled);
+rw_attribute(gc_after_writeback);
 rw_attribute(size);
 
 static ssize_t bch_snprint_string_list(char *buf,
@@ -264,7 +267,8 @@ STORE(__cached_dev)
 	d_strtoul(writeback_running);
 	d_strtoul(writeback_delay);
 
-	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
+	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent,
+			    0, bch_cutoff_writeback);
 
 	if (attr == &sysfs_writeback_rate) {
 		ssize_t ret;
@@ -384,8 +388,25 @@ STORE(bch_cached_dev)
 	mutex_lock(&bch_register_lock);
 	size = __cached_dev_store(kobj, attr, buf, size);
 
-	if (attr == &sysfs_writeback_running)
-		bch_writeback_queue(dc);
+	if (attr == &sysfs_writeback_running) {
+		/* dc->writeback_running changed in __cached_dev_store() */
+		if (IS_ERR_OR_NULL(dc->writeback_thread)) {
+			/*
+			 * reject setting it to 1 via sysfs if writeback
+			 * kthread is not created yet.
+			 */
+			if (dc->writeback_running) {
+				dc->writeback_running = false;
+				pr_err("%s: failed to run non-existent writeback thread",
+				       dc->disk.disk->disk_name);
+			}
+		} else
+			/*
+			 * writeback kthread will check if dc->writeback_running
+			 * is true or false.
+			 */
+			bch_writeback_queue(dc);
+	}
 
 	if (attr == &sysfs_writeback_percent)
 		if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
@@ -668,6 +689,9 @@ SHOW(__bch_cache_set)
 	sysfs_print(congested_write_threshold_us,
 		    c->congested_write_threshold_us);
 
+	sysfs_print(cutoff_writeback, bch_cutoff_writeback);
+	sysfs_print(cutoff_writeback_sync, bch_cutoff_writeback_sync);
+
 	sysfs_print(active_journal_entries,	fifo_used(&c->journal.pin));
 	sysfs_printf(verify,			"%i", c->verify);
 	sysfs_printf(key_merging_disabled,	"%i", c->key_merging_disabled);
@@ -676,6 +700,7 @@ SHOW(__bch_cache_set)
 	sysfs_printf(gc_always_rewrite,		"%i", c->gc_always_rewrite);
 	sysfs_printf(btree_shrinker_disabled,	"%i", c->shrinker_disabled);
 	sysfs_printf(copy_gc_enabled,		"%i", c->copy_gc_enabled);
+	sysfs_printf(gc_after_writeback,	"%i", c->gc_after_writeback);
 	sysfs_printf(io_disable,		"%i",
 		     test_bit(CACHE_SET_IO_DISABLE, &c->flags));
 
@@ -725,21 +750,8 @@ STORE(__bch_cache_set)
 		bch_cache_accounting_clear(&c->accounting);
 	}
 
-	if (attr == &sysfs_trigger_gc) {
-		/*
-		 * Garbage collection thread only works when sectors_to_gc < 0,
-		 * when users write to sysfs entry trigger_gc, most of time
-		 * they want to forcibly triger gargage collection. Here -1 is
-		 * set to c->sectors_to_gc, to make gc_should_run() give a
-		 * chance to permit gc thread to run. "give a chance" means
-		 * before going into gc_should_run(), there is still chance
-		 * that c->sectors_to_gc being set to other positive value. So
-		 * writing sysfs entry trigger_gc won't always make sure gc
-		 * thread takes effect.
-		 */
-		atomic_set(&c->sectors_to_gc, -1);
-		wake_up_gc(c);
-	}
+	if (attr == &sysfs_trigger_gc)
+		force_wake_up_gc(c);
 
 	if (attr == &sysfs_prune_cache) {
 		struct shrink_control sc;
@@ -789,6 +801,12 @@ STORE(__bch_cache_set)
 	sysfs_strtoul(gc_always_rewrite,	c->gc_always_rewrite);
 	sysfs_strtoul(btree_shrinker_disabled,	c->shrinker_disabled);
 	sysfs_strtoul(copy_gc_enabled,		c->copy_gc_enabled);
+	/*
+	 * Writing gc_after_writeback here may overwrite an already set
+	 * BCH_DO_AUTO_GC; that doesn't matter because the flag will be
+	 * set again at the next opportunity.
+	 */
+	sysfs_strtoul_clamp(gc_after_writeback, c->gc_after_writeback, 0, 1);
 
 	return size;
 }
@@ -869,7 +887,10 @@ static struct attribute *bch_cache_set_internal_files[] = {
 	&sysfs_gc_always_rewrite,
 	&sysfs_btree_shrinker_disabled,
 	&sysfs_copy_gc_enabled,
+	&sysfs_gc_after_writeback,
 	&sysfs_io_disable,
+	&sysfs_cutoff_writeback,
+	&sysfs_cutoff_writeback_sync,
 	NULL
 };
 KTYPE(bch_cache_set_internal);
@@ -17,6 +17,15 @@
 #include <linux/sched/clock.h>
 #include <trace/events/bcache.h>
 
+static void update_gc_after_writeback(struct cache_set *c)
+{
+	if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) ||
+	    c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
+		return;
+
+	c->gc_after_writeback |= BCH_DO_AUTO_GC;
+}
+
 /* Rate limiting */
 static uint64_t __calc_target_rate(struct cached_dev *dc)
 {
@@ -191,6 +200,7 @@ static void update_writeback_rate(struct work_struct *work)
 		if (!set_at_max_writeback_rate(c, dc)) {
 			down_read(&dc->writeback_lock);
 			__update_writeback_rate(dc);
+			update_gc_after_writeback(c);
 			up_read(&dc->writeback_lock);
 		}
 	}
@@ -689,6 +699,23 @@ static int bch_writeback_thread(void *arg)
 			up_write(&dc->writeback_lock);
 			break;
 		}
+
+		/*
+		 * When dirty data rate is high (e.g. 50%+), there might
+		 * be heavy buckets fragmentation after writeback
+		 * finished, which hurts following write performance.
+		 * If users really care about write performance they
+		 * may set BCH_ENABLE_AUTO_GC via sysfs, then when
+		 * BCH_DO_AUTO_GC is set, the garbage collection thread
+		 * will be woken up here. After moving gc, the shrunk
+		 * btree and discarded free buckets SSD space may be
+		 * helpful for following write requests.
+		 */
+		if (c->gc_after_writeback ==
+		    (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
+			c->gc_after_writeback &= ~BCH_DO_AUTO_GC;
+			force_wake_up_gc(c);
+		}
 	}
 
 	up_write(&dc->writeback_lock);
@@ -777,7 +804,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	bch_keybuf_init(&dc->writeback_keys);
 
 	dc->writeback_metadata		= true;
-	dc->writeback_running		= true;
+	dc->writeback_running		= false;
 	dc->writeback_percent		= 10;
 	dc->writeback_delay		= 30;
 	atomic_long_set(&dc->writeback_rate.rate, 1024);
@@ -805,6 +832,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc)
 		cached_dev_put(dc);
 		return PTR_ERR(dc->writeback_thread);
 	}
+	dc->writeback_running = true;
 
 	WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
 	schedule_delayed_work(&dc->writeback_rate_update,
@@ -5,12 +5,17 @@
 #define CUTOFF_WRITEBACK	40
 #define CUTOFF_WRITEBACK_SYNC	70
 
+#define CUTOFF_WRITEBACK_MAX		70
+#define CUTOFF_WRITEBACK_SYNC_MAX	90
+
 #define MAX_WRITEBACKS_IN_PASS  5
 #define MAX_WRITESIZE_IN_PASS   5000	/* *512b */
 
 #define WRITEBACK_RATE_UPDATE_SECS_MAX		60
 #define WRITEBACK_RATE_UPDATE_SECS_DEFAULT	5
 
+#define BCH_AUTO_GC_DIRTY_THRESHOLD	50
+
 /*
  * 14 (16384ths) is chosen here as something that each backing device
  * should be a reasonable fraction of the share, and not to blow up
@@ -53,6 +58,9 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
 	}
 }
 
+extern unsigned int bch_cutoff_writeback;
+extern unsigned int bch_cutoff_writeback_sync;
+
 static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 				    unsigned int cache_mode, bool would_skip)
 {
@@ -60,7 +68,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 
 	if (cache_mode != CACHE_MODE_WRITEBACK ||
 	    test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
-	    in_use > CUTOFF_WRITEBACK_SYNC)
+	    in_use > bch_cutoff_writeback_sync)
 		return false;
 
 	if (dc->partial_stripes_expensive &&
@@ -73,7 +81,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 
 	return (op_is_sync(bio->bi_opf) ||
 		bio->bi_opf & (REQ_META|REQ_PRIO) ||
-		in_use <= CUTOFF_WRITEBACK);
+		in_use <= bch_cutoff_writeback);
 }
 
 static inline void bch_writeback_queue(struct cached_dev *dc)
@@ -65,7 +65,6 @@ struct mapped_device {
 	 */
 	struct work_struct work;
 	wait_queue_head_t wait;
-	atomic_t pending[2];
 	spinlock_t deferred_lock;
 	struct bio_list deferred;
 
@@ -107,9 +106,6 @@ struct mapped_device {
 
 	struct block_device *bdev;
 
-	/* zero-length flush that will be cloned and submitted to targets */
-	struct bio flush_bio;
-
 	struct dm_stats stats;
 
 	/* for blk-mq request-based DM support */
@@ -119,7 +115,6 @@ struct mapped_device {
 	struct srcu_struct io_barrier;
 };
 
-int md_in_flight(struct mapped_device *md);
 void disable_write_same(struct mapped_device *md);
 void disable_write_zeroes(struct mapped_device *md);
 
@@ -43,7 +43,7 @@ static unsigned dm_get_blk_mq_queue_depth(void)
 
 int dm_request_based(struct mapped_device *md)
 {
-	return queue_is_rq_based(md->queue);
+	return queue_is_mq(md->queue);
 }
 
 void dm_start_queue(struct request_queue *q)
@@ -130,10 +130,8 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig)
  */
 static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 {
-	atomic_dec(&md->pending[rw]);
-
 	/* nudge anyone waiting on suspend queue */
-	if (!md_in_flight(md))
+	if (unlikely(waitqueue_active(&md->wait)))
 		wake_up(&md->wait);
 
 	/*
@@ -436,7 +434,6 @@ ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
 static void dm_start_request(struct mapped_device *md, struct request *orig)
 {
 	blk_mq_start_request(orig);
-	atomic_inc(&md->pending[rq_data_dir(orig)]);
 
 	if (unlikely(dm_stats_used(&md->stats))) {
 		struct dm_rq_target_io *tio = tio_from_request(orig);
@@ -919,12 +919,12 @@ static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev,
 	struct request_queue *q = bdev_get_queue(dev->bdev);
 	struct verify_rq_based_data *v = data;
 
-	if (q->mq_ops)
+	if (queue_is_mq(q))
 		v->mq_count++;
 	else
 		v->sq_count++;
 
-	return queue_is_rq_based(q);
+	return queue_is_mq(q);
 }
 
 static int dm_table_determine_type(struct dm_table *t)
@@ -646,26 +646,38 @@ static void free_tio(struct dm_target_io *tio)
 	bio_put(&tio->clone);
 }
 
-int md_in_flight(struct mapped_device *md)
+static bool md_in_flight_bios(struct mapped_device *md)
 {
-	return atomic_read(&md->pending[READ]) +
-	       atomic_read(&md->pending[WRITE]);
+	int cpu;
+	struct hd_struct *part = &dm_disk(md)->part0;
+	long sum = 0;
+
+	for_each_possible_cpu(cpu) {
+		sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
+		sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
+	}
+
+	return sum != 0;
+}
+
+static bool md_in_flight(struct mapped_device *md)
+{
+	if (queue_is_mq(md->queue))
+		return blk_mq_queue_inflight(md->queue);
+	else
+		return md_in_flight_bios(md);
 }
 
 static void start_io_acct(struct dm_io *io)
 {
 	struct mapped_device *md = io->md;
 	struct bio *bio = io->orig_bio;
-	int rw = bio_data_dir(bio);
 
 	io->start_time = jiffies;
 
 	generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
 			      &dm_disk(md)->part0);
 
-	atomic_set(&dm_disk(md)->part0.in_flight[rw],
-		   atomic_inc_return(&md->pending[rw]));
-
 	if (unlikely(dm_stats_used(&md->stats)))
 		dm_stats_account_io(&md->stats, bio_data_dir(bio),
 				    bio->bi_iter.bi_sector, bio_sectors(bio),
@@ -677,8 +689,6 @@ static void end_io_acct(struct dm_io *io)
 	struct mapped_device *md = io->md;
 	struct bio *bio = io->orig_bio;
 	unsigned long duration = jiffies - io->start_time;
-	int pending;
-	int rw = bio_data_dir(bio);
 
 	generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
 			    io->start_time);
@@ -688,16 +698,8 @@ static void end_io_acct(struct dm_io *io)
 				    bio->bi_iter.bi_sector, bio_sectors(bio),
 				    true, duration, &io->stats_aux);
 
-	/*
-	 * After this is decremented the bio must not be touched if it is
-	 * a flush.
-	 */
-	pending = atomic_dec_return(&md->pending[rw]);
-	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
-	pending += atomic_read(&md->pending[rw^0x1]);
-
 	/* nudge anyone waiting on suspend queue */
-	if (!pending)
+	if (unlikely(waitqueue_active(&md->wait)))
 		wake_up(&md->wait);
 }
 
@@ -1417,10 +1419,21 @@ static int __send_empty_flush(struct clone_info *ci)
 	unsigned target_nr = 0;
 	struct dm_target *ti;
 
+	/*
+	 * Empty flush uses a statically initialized bio, as the base for
+	 * cloning. However, blkg association requires that a bdev is
+	 * associated with a gendisk, which doesn't happen until the bdev is
+	 * opened. So, blkg association is done at issue time of the flush
+	 * rather than when the device is created in alloc_dev().
+	 */
+	bio_set_dev(ci->bio, ci->io->md->bdev);
+
 	BUG_ON(bio_has_data(ci->bio));
 	while ((ti = dm_table_get_target(ci->map, target_nr++)))
 		__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
 
+	bio_disassociate_blkg(ci->bio);
+
 	return 0;
 }
 
@@ -1598,7 +1611,16 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
 	init_clone_info(&ci, md, map, bio);
 
 	if (bio->bi_opf & REQ_PREFLUSH) {
-		ci.bio = &ci.io->md->flush_bio;
+		struct bio flush_bio;
+
+		/*
+		 * Use an on-stack bio for this, it's safe since we don't
+		 * need to reference it after submit. It's just used as
+		 * the basis for the clone(s).
+		 */
+		bio_init(&flush_bio, NULL, 0);
+		flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+		ci.bio = &flush_bio;
 		ci.sector_count = 0;
 		error = __send_empty_flush(&ci);
 		/* dec_pending submits any data associated with flush */
@@ -1654,7 +1676,16 @@ static blk_qc_t __process_bio(struct mapped_device *md,
 	init_clone_info(&ci, md, map, bio);
 
 	if (bio->bi_opf & REQ_PREFLUSH) {
-		ci.bio = &ci.io->md->flush_bio;
+		struct bio flush_bio;
+
+		/*
+		 * Use an on-stack bio for this, it's safe since we don't
+		 * need to reference it after submit. It's just used as
+		 * the basis for the clone(s).
+		 */
+		bio_init(&flush_bio, NULL, 0);
+		flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+		ci.bio = &flush_bio;
 		ci.sector_count = 0;
 		error = __send_empty_flush(&ci);
 		/* dec_pending submits any data associated with flush */
@@ -1898,7 +1929,7 @@ static struct mapped_device *alloc_dev(int minor)
 	INIT_LIST_HEAD(&md->table_devices);
 	spin_lock_init(&md->uevent_lock);
 
-	md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL);
+	md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
 	if (!md->queue)
 		goto bad;
 	md->queue->queuedata = md;
@@ -1908,8 +1939,6 @@ static struct mapped_device *alloc_dev(int minor)
 	if (!md->disk)
 		goto bad;
 
-	atomic_set(&md->pending[0], 0);
-	atomic_set(&md->pending[1], 0);
 	init_waitqueue_head(&md->wait);
 	INIT_WORK(&md->work, dm_wq_work);
 	init_waitqueue_head(&md->eventq);
@@ -1940,10 +1969,6 @@ static struct mapped_device *alloc_dev(int minor)
 	if (!md->bdev)
 		goto bad;
 
-	bio_init(&md->flush_bio, NULL, 0);
-	bio_set_dev(&md->flush_bio, md->bdev);
-	md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
-
 	dm_stats_init(&md->stats);
 
 	/* Populate the mapping, nobody knows we exist yet */
@@ -334,7 +334,6 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 	const int sgrp = op_stat_group(bio_op(bio));
 	struct mddev *mddev = q->queuedata;
 	unsigned int sectors;
-	int cpu;
 
 	blk_queue_split(q, &bio);
 
@@ -359,9 +358,9 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 
 	md_handle_request(mddev, bio);
 
-	cpu = part_stat_lock();
-	part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]);
-	part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors);
+	part_stat_lock();
+	part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
+	part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
 	part_stat_unlock();
 
 	return BLK_QC_T_NONE;
@@ -542,7 +542,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 		    !discard_bio)
 			continue;
 		bio_chain(discard_bio, bio);
-		bio_clone_blkcg_association(discard_bio, bio);
+		bio_clone_blkg_association(discard_bio, bio);
 		if (mddev->gendisk)
 			trace_block_bio_remap(bdev_get_queue(rdev->bdev),
 				discard_bio, disk_devt(mddev->gendisk),