Merge tag 'for-4.21/block-20181221' of git://git.kernel.dk/linux-block

Pull block updates from Jens Axboe:
 "This is the main pull request for block/storage for 4.21.

  Larger than usual, it was a busy round with lots of goodies queued up.
  Most notable is the removal of the old IO stack, which has been a long
  time coming. No new features for a while, everything coming in this
  week has all been fixes for things that were previously merged.

  This contains:

   - Use atomic counters instead of semaphores for mtip32xx (Arnd)

   - Cleanup of the mtip32xx request setup (Christoph)

   - Fix for circular locking dependency in loop (Jan, Tetsuo)

   - bcache (Coly, Guoju, Shenghui)
      * Optimizations for writeback caching
      * Various fixes and improvements

   - nvme (Chaitanya, Christoph, Sagi, Jay, me, Keith)
      * host and target support for NVMe over TCP
      * Error log page support
      * Support for separate read/write/poll queues
      * Much improved polling
      * discard OOM fallback
      * Tracepoint improvements

   - lightnvm (Hans, Hua, Igor, Matias, Javier)
      * Igor added packed metadata to pblk. Now drives without metadata
        per LBA can be used as well.
      * Fix from Geert on uninitialized value on chunk metadata reads.
      * Fixes from Hans and Javier to pblk recovery and write path.
      * Fix from Hua Su to fix a race condition in the pblk recovery
        code.
      * Scan optimization added to pblk recovery from Zhoujie.
      * Small geometry cleanup from me.

   - Conversion of the last few drivers that used the legacy path to
     blk-mq (me)

   - Removal of legacy IO path in SCSI (me, Christoph)

   - Removal of legacy IO stack and schedulers (me)

   - Support for much better polling, now without interrupts at all.
     blk-mq adds support for multiple queue maps, which enables us to
     have a map per type. This in turn enables nvme to have separate
     completion queues for polling, which can then be interrupt-less.
     Also means we're ready for async polled IO, which is hopefully
     coming in the next release.

   - Killing of (now) unused block exports (Christoph)

   - Unification of the blk-rq-qos and blk-wbt wait handling (Josef)

   - Support for zoned testing with null_blk (Masato)

   - sx8 conversion to per-host tag sets (Christoph)

   - IO priority improvements (Damien)

   - mq-deadline zoned fix (Damien)

   - Ref count blkcg series (Dennis)

   - Lots of blk-mq improvements and speedups (me)

   - sbitmap scalability improvements (me)

   - Make core inflight IO accounting per-cpu (Mikulas)

   - Export timeout setting in sysfs (Weiping)

   - Cleanup the direct issue path (Jianchao)

   - Export blk-wbt internals in block debugfs for easier debugging
     (Ming)

   - Lots of other fixes and improvements"

* tag 'for-4.21/block-20181221' of git://git.kernel.dk/linux-block: (364 commits)
  kyber: use sbitmap add_wait_queue/list_del wait helpers
  sbitmap: add helpers for add/del wait queue handling
  block: save irq state in blkg_lookup_create()
  dm: don't reuse bio for flushes
  nvme-pci: trace SQ status on completions
  nvme-rdma: implement polling queue map
  nvme-fabrics: allow user to pass in nr_poll_queues
  nvme-fabrics: allow nvmf_connect_io_queue to poll
  nvme-core: optionally poll sync commands
  block: make request_to_qc_t public
  nvme-tcp: fix spelling mistake "attepmpt" -> "attempt"
  nvme-tcp: fix endianess annotations
  nvmet-tcp: fix endianess annotations
  nvme-pci: refactor nvme_poll_irqdisable to make sparse happy
  nvme-pci: only set nr_maps to 2 if poll queues are supported
  nvmet: use a macro for default error location
  nvmet: fix comparison of a u16 with -1
  blk-mq: enable IO poll if .nr_queues of type poll > 0
  blk-mq: change blk_mq_queue_busy() to blk_mq_queue_inflight()
  blk-mq: skip zero-queue maps in blk_mq_map_swqueue
  ...
This commit is contained in:
Linus Torvalds
2018-12-28 13:19:59 -08:00
förälder b12a9124ee 00203ba40d
incheckning 0e9da3fbf7
246 ändrade filer med 10469 tillägg och 14370 borttagningar

Visa fil

@@ -626,6 +626,20 @@ struct cache_set {
/* Where in the btree gc currently is */
struct bkey gc_done;
/*
* For automatical garbage collection after writeback completed, this
* varialbe is used as bit fields,
* - 0000 0001b (BCH_ENABLE_AUTO_GC): enable gc after writeback
* - 0000 0010b (BCH_DO_AUTO_GC): do gc after writeback
* This is an optimization for following write request after writeback
* finished, but read hit rate dropped due to clean data on cache is
* discarded. Unless user explicitly sets it via sysfs, it won't be
* enabled.
*/
#define BCH_ENABLE_AUTO_GC 1
#define BCH_DO_AUTO_GC 2
uint8_t gc_after_writeback;
/*
* The allocation code needs gc_mark in struct bucket to be correct, but
* it's not while a gc is in progress. Protected by bucket_lock.
@@ -658,7 +672,11 @@ struct cache_set {
/*
* A btree node on disk could have too many bsets for an iterator to fit
* on the stack - have to dynamically allocate them
* on the stack - have to dynamically allocate them.
* bch_cache_set_alloc() will make sure the pool can allocate iterators
* equipped with enough room that can host
* (sb.bucket_size / sb.block_size)
* btree_iter_sets, which is more than static MAX_BSETS.
*/
mempool_t fill_iter;

Visa fil

@@ -207,6 +207,11 @@ void bch_btree_node_read_done(struct btree *b)
struct bset *i = btree_bset_first(b);
struct btree_iter *iter;
/*
* c->fill_iter can allocate an iterator with more memory space
* than static MAX_BSETS.
* See the comment arount cache_set->fill_iter.
*/
iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
iter->used = 0;

Visa fil

@@ -266,6 +266,24 @@ static inline void wake_up_gc(struct cache_set *c)
wake_up(&c->gc_wait);
}
static inline void force_wake_up_gc(struct cache_set *c)
{
/*
* Garbage collection thread only works when sectors_to_gc < 0,
* calling wake_up_gc() won't start gc thread if sectors_to_gc is
* not a nagetive value.
* Therefore sectors_to_gc is set to -1 here, before waking up
* gc thread by calling wake_up_gc(). Then gc_should_run() will
* give a chance to permit gc thread to run. "Give a chance" means
* before going into gc_should_run(), there is still possibility
* that c->sectors_to_gc being set to other positive value. So
* this routine won't 100% make sure gc thread will be woken up
* to run.
*/
atomic_set(&c->sectors_to_gc, -1);
wake_up_gc(c);
}
#define MAP_DONE 0
#define MAP_CONTINUE 1

Visa fil

@@ -249,8 +249,7 @@ void bch_debug_init_cache_set(struct cache_set *c)
void bch_debug_exit(void)
{
if (!IS_ERR_OR_NULL(bcache_debug))
debugfs_remove_recursive(bcache_debug);
debugfs_remove_recursive(bcache_debug);
}
void __init bch_debug_init(void)

Visa fil

@@ -663,7 +663,7 @@ static void journal_write_unlocked(struct closure *cl)
REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
bch_bio_map(bio, w->data);
trace_bcache_journal_write(bio);
trace_bcache_journal_write(bio, w->data->keys);
bio_list_add(&list, bio);
SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);

Visa fil

@@ -311,11 +311,11 @@ err:
* data is written it calls bch_journal, and after the keys have been added to
* the next journal write they're inserted into the btree.
*
* It inserts the data in s->cache_bio; bi_sector is used for the key offset,
* It inserts the data in op->bio; bi_sector is used for the key offset,
* and op->inode is used for the key inode.
*
* If s->bypass is true, instead of inserting the data it invalidates the
* region of the cache represented by s->cache_bio and op->inode.
* If op->bypass is true, instead of inserting the data it invalidates the
* region of the cache represented by op->bio and op->inode.
*/
void bch_data_insert(struct closure *cl)
{

Visa fil

@@ -25,8 +25,8 @@
#include <linux/reboot.h>
#include <linux/sysfs.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
unsigned int bch_cutoff_writeback;
unsigned int bch_cutoff_writeback_sync;
static const char bcache_magic[] = {
0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
@@ -1510,8 +1510,7 @@ static void cache_set_free(struct closure *cl)
struct cache *ca;
unsigned int i;
if (!IS_ERR_OR_NULL(c->debug))
debugfs_remove(c->debug);
debugfs_remove(c->debug);
bch_open_buckets_free(c);
bch_btree_cache_free(c);
@@ -2424,6 +2423,32 @@ static void bcache_exit(void)
mutex_destroy(&bch_register_lock);
}
/* Check and fixup module parameters */
static void check_module_parameters(void)
{
if (bch_cutoff_writeback_sync == 0)
bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u",
bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
}
if (bch_cutoff_writeback == 0)
bch_cutoff_writeback = CUTOFF_WRITEBACK;
else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
pr_warn("set bch_cutoff_writeback (%u) to max value %u",
bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
}
if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
pr_warn("set bch_cutoff_writeback (%u) to %u",
bch_cutoff_writeback, bch_cutoff_writeback_sync);
bch_cutoff_writeback = bch_cutoff_writeback_sync;
}
}
static int __init bcache_init(void)
{
static const struct attribute *files[] = {
@@ -2432,6 +2457,8 @@ static int __init bcache_init(void)
NULL
};
check_module_parameters();
mutex_init(&bch_register_lock);
init_waitqueue_head(&unregister_wait);
register_reboot_notifier(&reboot);
@@ -2468,5 +2495,18 @@ err:
return -ENOMEM;
}
/*
* Module hooks
*/
module_exit(bcache_exit);
module_init(bcache_init);
module_param(bch_cutoff_writeback, uint, 0);
MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");
module_param(bch_cutoff_writeback_sync, uint, 0);
MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");
MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
MODULE_LICENSE("GPL");

Visa fil

@@ -16,7 +16,7 @@
#include <linux/sort.h>
#include <linux/sched/clock.h>
/* Default is -1; we skip past it for struct cached_dev's cache mode */
/* Default is 0 ("writethrough") */
static const char * const bch_cache_modes[] = {
"writethrough",
"writeback",
@@ -25,7 +25,7 @@ static const char * const bch_cache_modes[] = {
NULL
};
/* Default is -1; we skip past it for stop_when_cache_set_failed */
/* Default is 0 ("auto") */
static const char * const bch_stop_on_failure_modes[] = {
"auto",
"always",
@@ -88,6 +88,8 @@ read_attribute(writeback_keys_done);
read_attribute(writeback_keys_failed);
read_attribute(io_errors);
read_attribute(congested);
read_attribute(cutoff_writeback);
read_attribute(cutoff_writeback_sync);
rw_attribute(congested_read_threshold_us);
rw_attribute(congested_write_threshold_us);
@@ -128,6 +130,7 @@ rw_attribute(expensive_debug_checks);
rw_attribute(cache_replacement_policy);
rw_attribute(btree_shrinker_disabled);
rw_attribute(copy_gc_enabled);
rw_attribute(gc_after_writeback);
rw_attribute(size);
static ssize_t bch_snprint_string_list(char *buf,
@@ -264,7 +267,8 @@ STORE(__cached_dev)
d_strtoul(writeback_running);
d_strtoul(writeback_delay);
sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent,
0, bch_cutoff_writeback);
if (attr == &sysfs_writeback_rate) {
ssize_t ret;
@@ -384,8 +388,25 @@ STORE(bch_cached_dev)
mutex_lock(&bch_register_lock);
size = __cached_dev_store(kobj, attr, buf, size);
if (attr == &sysfs_writeback_running)
bch_writeback_queue(dc);
if (attr == &sysfs_writeback_running) {
/* dc->writeback_running changed in __cached_dev_store() */
if (IS_ERR_OR_NULL(dc->writeback_thread)) {
/*
* reject setting it to 1 via sysfs if writeback
* kthread is not created yet.
*/
if (dc->writeback_running) {
dc->writeback_running = false;
pr_err("%s: failed to run non-existent writeback thread",
dc->disk.disk->disk_name);
}
} else
/*
* writeback kthread will check if dc->writeback_running
* is true or false.
*/
bch_writeback_queue(dc);
}
if (attr == &sysfs_writeback_percent)
if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
@@ -668,6 +689,9 @@ SHOW(__bch_cache_set)
sysfs_print(congested_write_threshold_us,
c->congested_write_threshold_us);
sysfs_print(cutoff_writeback, bch_cutoff_writeback);
sysfs_print(cutoff_writeback_sync, bch_cutoff_writeback_sync);
sysfs_print(active_journal_entries, fifo_used(&c->journal.pin));
sysfs_printf(verify, "%i", c->verify);
sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled);
@@ -676,6 +700,7 @@ SHOW(__bch_cache_set)
sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback);
sysfs_printf(io_disable, "%i",
test_bit(CACHE_SET_IO_DISABLE, &c->flags));
@@ -725,21 +750,8 @@ STORE(__bch_cache_set)
bch_cache_accounting_clear(&c->accounting);
}
if (attr == &sysfs_trigger_gc) {
/*
* Garbage collection thread only works when sectors_to_gc < 0,
* when users write to sysfs entry trigger_gc, most of time
* they want to forcibly triger gargage collection. Here -1 is
* set to c->sectors_to_gc, to make gc_should_run() give a
* chance to permit gc thread to run. "give a chance" means
* before going into gc_should_run(), there is still chance
* that c->sectors_to_gc being set to other positive value. So
* writing sysfs entry trigger_gc won't always make sure gc
* thread takes effect.
*/
atomic_set(&c->sectors_to_gc, -1);
wake_up_gc(c);
}
if (attr == &sysfs_trigger_gc)
force_wake_up_gc(c);
if (attr == &sysfs_prune_cache) {
struct shrink_control sc;
@@ -789,6 +801,12 @@ STORE(__bch_cache_set)
sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite);
sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled);
sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled);
/*
* write gc_after_writeback here may overwrite an already set
* BCH_DO_AUTO_GC, it doesn't matter because this flag will be
* set in next chance.
*/
sysfs_strtoul_clamp(gc_after_writeback, c->gc_after_writeback, 0, 1);
return size;
}
@@ -869,7 +887,10 @@ static struct attribute *bch_cache_set_internal_files[] = {
&sysfs_gc_always_rewrite,
&sysfs_btree_shrinker_disabled,
&sysfs_copy_gc_enabled,
&sysfs_gc_after_writeback,
&sysfs_io_disable,
&sysfs_cutoff_writeback,
&sysfs_cutoff_writeback_sync,
NULL
};
KTYPE(bch_cache_set_internal);

Visa fil

@@ -17,6 +17,15 @@
#include <linux/sched/clock.h>
#include <trace/events/bcache.h>
static void update_gc_after_writeback(struct cache_set *c)
{
if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) ||
c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
return;
c->gc_after_writeback |= BCH_DO_AUTO_GC;
}
/* Rate limiting */
static uint64_t __calc_target_rate(struct cached_dev *dc)
{
@@ -191,6 +200,7 @@ static void update_writeback_rate(struct work_struct *work)
if (!set_at_max_writeback_rate(c, dc)) {
down_read(&dc->writeback_lock);
__update_writeback_rate(dc);
update_gc_after_writeback(c);
up_read(&dc->writeback_lock);
}
}
@@ -689,6 +699,23 @@ static int bch_writeback_thread(void *arg)
up_write(&dc->writeback_lock);
break;
}
/*
* When dirty data rate is high (e.g. 50%+), there might
* be heavy buckets fragmentation after writeback
* finished, which hurts following write performance.
* If users really care about write performance they
* may set BCH_ENABLE_AUTO_GC via sysfs, then when
* BCH_DO_AUTO_GC is set, garbage collection thread
* will be wake up here. After moving gc, the shrunk
* btree and discarded free buckets SSD space may be
* helpful for following write requests.
*/
if (c->gc_after_writeback ==
(BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
c->gc_after_writeback &= ~BCH_DO_AUTO_GC;
force_wake_up_gc(c);
}
}
up_write(&dc->writeback_lock);
@@ -777,7 +804,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
bch_keybuf_init(&dc->writeback_keys);
dc->writeback_metadata = true;
dc->writeback_running = true;
dc->writeback_running = false;
dc->writeback_percent = 10;
dc->writeback_delay = 30;
atomic_long_set(&dc->writeback_rate.rate, 1024);
@@ -805,6 +832,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc)
cached_dev_put(dc);
return PTR_ERR(dc->writeback_thread);
}
dc->writeback_running = true;
WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
schedule_delayed_work(&dc->writeback_rate_update,

Visa fil

@@ -5,12 +5,17 @@
#define CUTOFF_WRITEBACK 40
#define CUTOFF_WRITEBACK_SYNC 70
#define CUTOFF_WRITEBACK_MAX 70
#define CUTOFF_WRITEBACK_SYNC_MAX 90
#define MAX_WRITEBACKS_IN_PASS 5
#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */
#define WRITEBACK_RATE_UPDATE_SECS_MAX 60
#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5
#define BCH_AUTO_GC_DIRTY_THRESHOLD 50
/*
* 14 (16384ths) is chosen here as something that each backing device
* should be a reasonable fraction of the share, and not to blow up
@@ -53,6 +58,9 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
}
}
extern unsigned int bch_cutoff_writeback;
extern unsigned int bch_cutoff_writeback_sync;
static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
unsigned int cache_mode, bool would_skip)
{
@@ -60,7 +68,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
if (cache_mode != CACHE_MODE_WRITEBACK ||
test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
in_use > CUTOFF_WRITEBACK_SYNC)
in_use > bch_cutoff_writeback_sync)
return false;
if (dc->partial_stripes_expensive &&
@@ -73,7 +81,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
return (op_is_sync(bio->bi_opf) ||
bio->bi_opf & (REQ_META|REQ_PRIO) ||
in_use <= CUTOFF_WRITEBACK);
in_use <= bch_cutoff_writeback);
}
static inline void bch_writeback_queue(struct cached_dev *dc)

Visa fil

@@ -65,7 +65,6 @@ struct mapped_device {
*/
struct work_struct work;
wait_queue_head_t wait;
atomic_t pending[2];
spinlock_t deferred_lock;
struct bio_list deferred;
@@ -107,9 +106,6 @@ struct mapped_device {
struct block_device *bdev;
/* zero-length flush that will be cloned and submitted to targets */
struct bio flush_bio;
struct dm_stats stats;
/* for blk-mq request-based DM support */
@@ -119,7 +115,6 @@ struct mapped_device {
struct srcu_struct io_barrier;
};
int md_in_flight(struct mapped_device *md);
void disable_write_same(struct mapped_device *md);
void disable_write_zeroes(struct mapped_device *md);

Visa fil

@@ -43,7 +43,7 @@ static unsigned dm_get_blk_mq_queue_depth(void)
int dm_request_based(struct mapped_device *md)
{
return queue_is_rq_based(md->queue);
return queue_is_mq(md->queue);
}
void dm_start_queue(struct request_queue *q)
@@ -130,10 +130,8 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig)
*/
static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
{
atomic_dec(&md->pending[rw]);
/* nudge anyone waiting on suspend queue */
if (!md_in_flight(md))
if (unlikely(waitqueue_active(&md->wait)))
wake_up(&md->wait);
/*
@@ -436,7 +434,6 @@ ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
static void dm_start_request(struct mapped_device *md, struct request *orig)
{
blk_mq_start_request(orig);
atomic_inc(&md->pending[rq_data_dir(orig)]);
if (unlikely(dm_stats_used(&md->stats))) {
struct dm_rq_target_io *tio = tio_from_request(orig);

Visa fil

@@ -919,12 +919,12 @@ static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev,
struct request_queue *q = bdev_get_queue(dev->bdev);
struct verify_rq_based_data *v = data;
if (q->mq_ops)
if (queue_is_mq(q))
v->mq_count++;
else
v->sq_count++;
return queue_is_rq_based(q);
return queue_is_mq(q);
}
static int dm_table_determine_type(struct dm_table *t)

Visa fil

@@ -646,26 +646,38 @@ static void free_tio(struct dm_target_io *tio)
bio_put(&tio->clone);
}
int md_in_flight(struct mapped_device *md)
static bool md_in_flight_bios(struct mapped_device *md)
{
return atomic_read(&md->pending[READ]) +
atomic_read(&md->pending[WRITE]);
int cpu;
struct hd_struct *part = &dm_disk(md)->part0;
long sum = 0;
for_each_possible_cpu(cpu) {
sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
}
return sum != 0;
}
static bool md_in_flight(struct mapped_device *md)
{
if (queue_is_mq(md->queue))
return blk_mq_queue_inflight(md->queue);
else
return md_in_flight_bios(md);
}
static void start_io_acct(struct dm_io *io)
{
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
int rw = bio_data_dir(bio);
io->start_time = jiffies;
generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
&dm_disk(md)->part0);
atomic_set(&dm_disk(md)->part0.in_flight[rw],
atomic_inc_return(&md->pending[rw]));
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio_data_dir(bio),
bio->bi_iter.bi_sector, bio_sectors(bio),
@@ -677,8 +689,6 @@ static void end_io_acct(struct dm_io *io)
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
unsigned long duration = jiffies - io->start_time;
int pending;
int rw = bio_data_dir(bio);
generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
io->start_time);
@@ -688,16 +698,8 @@ static void end_io_acct(struct dm_io *io)
bio->bi_iter.bi_sector, bio_sectors(bio),
true, duration, &io->stats_aux);
/*
* After this is decremented the bio must not be touched if it is
* a flush.
*/
pending = atomic_dec_return(&md->pending[rw]);
atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
pending += atomic_read(&md->pending[rw^0x1]);
/* nudge anyone waiting on suspend queue */
if (!pending)
if (unlikely(waitqueue_active(&md->wait)))
wake_up(&md->wait);
}
@@ -1417,10 +1419,21 @@ static int __send_empty_flush(struct clone_info *ci)
unsigned target_nr = 0;
struct dm_target *ti;
/*
* Empty flush uses a statically initialized bio, as the base for
* cloning. However, blkg association requires that a bdev is
* associated with a gendisk, which doesn't happen until the bdev is
* opened. So, blkg association is done at issue time of the flush
* rather than when the device is created in alloc_dev().
*/
bio_set_dev(ci->bio, ci->io->md->bdev);
BUG_ON(bio_has_data(ci->bio));
while ((ti = dm_table_get_target(ci->map, target_nr++)))
__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
bio_disassociate_blkg(ci->bio);
return 0;
}
@@ -1598,7 +1611,16 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
init_clone_info(&ci, md, map, bio);
if (bio->bi_opf & REQ_PREFLUSH) {
ci.bio = &ci.io->md->flush_bio;
struct bio flush_bio;
/*
* Use an on-stack bio for this, it's safe since we don't
* need to reference it after submit. It's just used as
* the basis for the clone(s).
*/
bio_init(&flush_bio, NULL, 0);
flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
ci.bio = &flush_bio;
ci.sector_count = 0;
error = __send_empty_flush(&ci);
/* dec_pending submits any data associated with flush */
@@ -1654,7 +1676,16 @@ static blk_qc_t __process_bio(struct mapped_device *md,
init_clone_info(&ci, md, map, bio);
if (bio->bi_opf & REQ_PREFLUSH) {
ci.bio = &ci.io->md->flush_bio;
struct bio flush_bio;
/*
* Use an on-stack bio for this, it's safe since we don't
* need to reference it after submit. It's just used as
* the basis for the clone(s).
*/
bio_init(&flush_bio, NULL, 0);
flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
ci.bio = &flush_bio;
ci.sector_count = 0;
error = __send_empty_flush(&ci);
/* dec_pending submits any data associated with flush */
@@ -1898,7 +1929,7 @@ static struct mapped_device *alloc_dev(int minor)
INIT_LIST_HEAD(&md->table_devices);
spin_lock_init(&md->uevent_lock);
md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL);
md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
if (!md->queue)
goto bad;
md->queue->queuedata = md;
@@ -1908,8 +1939,6 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->disk)
goto bad;
atomic_set(&md->pending[0], 0);
atomic_set(&md->pending[1], 0);
init_waitqueue_head(&md->wait);
INIT_WORK(&md->work, dm_wq_work);
init_waitqueue_head(&md->eventq);
@@ -1940,10 +1969,6 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->bdev)
goto bad;
bio_init(&md->flush_bio, NULL, 0);
bio_set_dev(&md->flush_bio, md->bdev);
md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
dm_stats_init(&md->stats);
/* Populate the mapping, nobody knows we exist yet */

Visa fil

@@ -334,7 +334,6 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
const int sgrp = op_stat_group(bio_op(bio));
struct mddev *mddev = q->queuedata;
unsigned int sectors;
int cpu;
blk_queue_split(q, &bio);
@@ -359,9 +358,9 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
md_handle_request(mddev, bio);
cpu = part_stat_lock();
part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]);
part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors);
part_stat_lock();
part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
part_stat_unlock();
return BLK_QC_T_NONE;

Visa fil

@@ -542,7 +542,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
!discard_bio)
continue;
bio_chain(discard_bio, bio);
bio_clone_blkcg_association(discard_bio, bio);
bio_clone_blkg_association(discard_bio, bio);
if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(rdev->bdev),
discard_bio, disk_devt(mddev->gendisk),