Merge branch 'for-4.14/block-postmerge' of git://git.kernel.dk/linux-block

Pull followup block layer updates from Jens Axboe:
 "I ended up splitting the main pull request for this series into two,
  mainly because of clashes between NVMe fixes that went into 4.13 after
  the for-4.14 branches were split off. This pull request is mostly
  NVMe, but not exclusively. In detail, it contains:

   - Two pull requests for NVMe changes from Christoph. Nothing new on
     the feature front, basically just fixes all over the map for the
     core bits, transport, rdma, etc.

   - Series from Bart, cleaning up various bits in the BFQ scheduler.

   - Series of bcache fixes, which have been lingering for a release or
     two. Coly sent this in, but the patches are from various people in
     this area.

   - Set of patches for BFQ from Paolo himself, updating both
     documentation and fixing some corner cases in performance.

   - Series from Omar, now attempting to get the 4k loop support
     correct. Our confidence level is higher this time.

   - Series from Shaohua for loop as well, improving O_DIRECT
     performance and fixing a use-after-free"

* 'for-4.14/block-postmerge' of git://git.kernel.dk/linux-block: (74 commits)
  bcache: initialize dirty stripes in flash_dev_run()
  loop: set physical block size to logical block size
  bcache: fix bch_hprint crash and improve output
  bcache: Update continue_at() documentation
  bcache: silence static checker warning
  bcache: fix for gc and write-back race
  bcache: increase the number of open buckets
  bcache: Correct return value for sysfs attach errors
  bcache: correct cache_dirty_target in __update_writeback_rate()
  bcache: gc does not work when triggering by manual command
  bcache: Don't reinvent the wheel but use existing llist API
  bcache: do not subtract sectors_to_gc for bypassed IO
  bcache: fix sequential large write IO bypass
  bcache: Fix leak of bdev reference
  block/loop: remove unused field
  block/loop: fix use after free
  bfq: Use icq_to_bic() consistently
  bfq: Suppress compiler warnings about comparisons
  bfq: Check kstrtoul() return value
  bfq: Declare local functions static
  ...
Committed by Linus Torvalds on 2017-09-09 12:49:01 -07:00

 35 files changed, 1141 insertions(+), 815 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -68,6 +68,8 @@
 #include <linux/random.h>
 #include <trace/events/bcache.h>
 
+#define MAX_OPEN_BUCKETS 128
+
 /* Bucket heap / gen */
 
 uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
@@ -671,7 +673,7 @@ int bch_open_buckets_alloc(struct cache_set *c)
 
 	spin_lock_init(&c->data_bucket_lock);
 
-	for (i = 0; i < 6; i++) {
+	for (i = 0; i < MAX_OPEN_BUCKETS; i++) {
 		struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
 		if (!b)
 			return -ENOMEM;

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -333,6 +333,7 @@ struct cached_dev {
 	/* Limit number of writeback bios in flight */
 	struct semaphore	in_flight;
 	struct task_struct	*writeback_thread;
+	struct workqueue_struct	*writeback_write_wq;
 
 	struct keybuf		writeback_keys;

diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -70,21 +70,10 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
 	list = llist_del_all(&wait_list->list);
 
 	/* We first reverse the list to preserve FIFO ordering and fairness */
-
-	while (list) {
-		struct llist_node *t = list;
-
-		list = llist_next(list);
-
-		t->next = reverse;
-		reverse = t;
-	}
+	reverse = llist_reverse_order(list);
 
 	/* Then do the wakeups */
-	while (reverse) {
-		cl = container_of(reverse, struct closure, list);
-		reverse = llist_next(reverse);
-
+	llist_for_each_entry(cl, reverse, list) {
 		closure_set_waiting(cl, 0);
 		closure_sub(cl, CLOSURE_WAITING + 1);
 	}

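The conversion above relies on a property worth spelling out: waiters add themselves to the head of the lock-free list, so llist_del_all() hands them back newest-first, and a single reversal restores FIFO order for the wakeups. A minimal userspace sketch of that pattern, with plain pointers standing in for the kernel's llist primitives (all names here are hypothetical analogues, not the real API):

#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	int id;
};

/* analogue of llist_reverse_order(): reverse a singly linked list */
static struct node *reverse_order(struct node *list)
{
	struct node *rev = NULL;

	while (list) {
		struct node *t = list;

		list = list->next;
		t->next = rev;
		rev = t;
	}
	return rev;
}

int main(void)
{
	struct node *head = NULL, *n;
	int i;

	/* waiters 0, 1, 2 arrive in order; pushing to the head (as
	 * llist_add() does) leaves the list in LIFO order: 2, 1, 0 */
	for (i = 0; i < 3; i++) {
		n = malloc(sizeof(*n));
		n->id = i;
		n->next = head;
		head = n;
	}

	/* one reversal makes the wakeups FIFO again: 0, 1, 2 */
	for (n = reverse_order(head); n; n = n->next)
		printf("waking waiter %d\n", n->id);

	return 0;
}
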
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -312,8 +312,6 @@ static inline void closure_wake_up(struct closure_waitlist *list)
 * been dropped with closure_put()), it will resume execution at @fn running out
 * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
 *
- * NOTE: This macro expands to a return in the calling function!
- *
 * This is because after calling continue_at() you no longer have a ref on @cl,
 * and whatever @cl owns may be freed out from under you - a running closure fn
 * has a ref on its own closure which continue_at() drops.
@@ -340,8 +338,6 @@ do {							\
 * Causes @fn to be executed out of @cl, in @wq context (or called directly if
 * @wq is NULL).
 *
- * NOTE: like continue_at(), this macro expands to a return in the caller!
- *
 * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
 * thus it's not safe to touch anything protected by @cl after a
 * continue_at_nobarrier().

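The removed NOTE was simply wrong: continue_at() does not expand to a return; it only hands the caller's ref over to @fn, so the caller has to return immediately by its own discipline. A self-contained userspace sketch of that hand-off (a deliberately simplified, single-threaded refcount model; none of these names are the real kernel API):

#include <stdio.h>
#include <stdlib.h>

struct closure {
	int ref;
	void (*fn)(struct closure *);
};

static void closure_put(struct closure *cl)
{
	/* dropping the last ref runs the continuation */
	if (--cl->ref == 0 && cl->fn)
		cl->fn(cl);
}

static void continue_at_sketch(struct closure *cl, void (*fn)(struct closure *))
{
	cl->fn = fn;
	closure_put(cl);	/* the caller's ref is gone from here on */
}

static void next_fn(struct closure *cl)
{
	printf("resumed in next_fn\n");
	free(cl);		/* the closure's owner may free it */
}

static void first_fn(struct closure *cl)
{
	printf("running first_fn\n");
	continue_at_sketch(cl, next_fn);
	/* cl may already be freed: touching it here would be a
	 * use-after-free, which is why callers must return now */
}

int main(void)
{
	struct closure *cl = malloc(sizeof(*cl));

	cl->ref = 1;
	cl->fn = NULL;
	first_fn(cl);
	return 0;
}
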
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -196,12 +196,12 @@ static void bch_data_insert_start(struct closure *cl)
 	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 	struct bio *bio = op->bio, *n;
 
-	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
-		wake_up_gc(op->c);
-
 	if (op->bypass)
 		return bch_data_invalidate(cl);
 
+	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
+		wake_up_gc(op->c);
+
 	/*
 	 * Journal writes are marked REQ_PREFLUSH; if the original write was a
 	 * flush, it'll wait on the journal write.
@@ -400,12 +400,6 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	if (!congested && !dc->sequential_cutoff)
 		goto rescale;
 
-	if (!congested &&
-	    mode == CACHE_MODE_WRITEBACK &&
-	    op_is_write(bio->bi_opf) &&
-	    op_is_sync(bio->bi_opf))
-		goto rescale;
-
 	spin_lock(&dc->io_lock);
 
 	hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1026,7 +1026,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
 	}
 
 	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
-		bch_sectors_dirty_init(dc);
+		bch_sectors_dirty_init(&dc->disk);
 		atomic_set(&dc->has_dirty, 1);
 		atomic_inc(&dc->count);
 		bch_writeback_queue(dc);
@@ -1059,6 +1059,8 @@ static void cached_dev_free(struct closure *cl)
 	cancel_delayed_work_sync(&dc->writeback_rate_update);
 	if (!IS_ERR_OR_NULL(dc->writeback_thread))
 		kthread_stop(dc->writeback_thread);
+	if (dc->writeback_write_wq)
+		destroy_workqueue(dc->writeback_write_wq);
 
 	mutex_lock(&bch_register_lock);
@@ -1228,6 +1230,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
 		goto err;
 
 	bcache_device_attach(d, c, u - c->uuids);
+	bch_sectors_dirty_init(d);
 	bch_flash_dev_request_init(d);
 	add_disk(d->disk);
@@ -1374,9 +1377,6 @@ static void cache_set_flush(struct closure *cl)
 	struct btree *b;
 	unsigned i;
 
-	if (!c)
-		closure_return(cl);
-
 	bch_cache_accounting_destroy(&c->accounting);
 	kobject_put(&c->internal);
@@ -1964,6 +1964,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 		else
 			err = "device busy";
 		mutex_unlock(&bch_register_lock);
+		if (!IS_ERR(bdev))
+			bdput(bdev);
 		if (attr == &ksysfs_register_quiet)
 			goto out;
 	}

diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -192,7 +192,7 @@ STORE(__cached_dev)
 {
 	struct cached_dev *dc = container_of(kobj, struct cached_dev,
 					     disk.kobj);
-	unsigned v = size;
+	ssize_t v = size;
 	struct cache_set *c;
 	struct kobj_uevent_env *env;
@@ -227,7 +227,7 @@ STORE(__cached_dev)
 			bch_cached_dev_run(dc);
 
 	if (attr == &sysfs_cache_mode) {
-		ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1);
+		v = bch_read_string_list(buf, bch_cache_modes + 1);
 
 		if (v < 0)
 			return v;
@@ -615,8 +615,21 @@ STORE(__bch_cache_set)
 		bch_cache_accounting_clear(&c->accounting);
 	}
 
-	if (attr == &sysfs_trigger_gc)
+	if (attr == &sysfs_trigger_gc) {
+		/*
+		 * The garbage collection thread only runs while
+		 * sectors_to_gc < 0, so when users write to the sysfs
+		 * entry trigger_gc they usually want to force garbage
+		 * collection. Setting c->sectors_to_gc to -1 here gives
+		 * gc_should_run() a chance to let the gc thread run.
+		 * It is only "a chance": before gc_should_run() is
+		 * reached, c->sectors_to_gc may be set back to another
+		 * positive value, so writing the sysfs entry trigger_gc
+		 * does not guarantee that the gc thread actually runs.
+		 */
+		atomic_set(&c->sectors_to_gc, -1);
 		wake_up_gc(c);
+	}
 
 	if (attr == &sysfs_prune_cache) {
 		struct shrink_control sc;

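Both sysfs fixes above are signedness bugs: with `unsigned v = size`, any later `if (v < 0)` test can never be true, and the inner `ssize_t v` shadowed the outer variable, so a negative return from bch_read_string_list() was silently dropped. A tiny standalone illustration of the first trap (the -22 stands in for an errno-style code; the values are made up):

#include <stdio.h>

int main(void)
{
	int err = -22;		/* e.g. -EINVAL from a parser */
	unsigned v = err;	/* wraps to a huge positive value */

	if (v < 0)		/* always false: v is unsigned */
		printf("error propagated\n");
	else
		printf("error lost, v = %u\n", v);

	return 0;
}
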
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -74,24 +74,44 @@ STRTO_H(strtouint, unsigned int)
 STRTO_H(strtoll, long long)
 STRTO_H(strtoull, unsigned long long)
 
+/**
+ * bch_hprint() - formats @v to human readable string for sysfs.
+ *
+ * @v - signed 64 bit integer
+ * @buf - the (at least 8 byte) buffer to format the result into.
+ *
+ * Returns the number of bytes used by format.
+ */
 ssize_t bch_hprint(char *buf, int64_t v)
 {
 	static const char units[] = "?kMGTPEZY";
-	char dec[4] = "";
-	int u, t = 0;
-
-	for (u = 0; v >= 1024 || v <= -1024; u++) {
-		t = v & ~(~0 << 10);
-		v >>= 10;
-	}
-
-	if (!u)
-		return sprintf(buf, "%llu", v);
-
-	if (v < 100 && v > -100)
-		snprintf(dec, sizeof(dec), ".%i", t / 100);
-
-	return sprintf(buf, "%lli%s%c", v, dec, units[u]);
+	int u = 0, t;
+
+	uint64_t q;
+
+	if (v < 0)
+		q = -v;
+	else
+		q = v;
+
+	/* For as long as the number is more than 3 digits, but at least
+	 * once, shift right / divide by 1024.  Keep the remainder for
+	 * a digit after the decimal point.
+	 */
+	do {
+		u++;
+
+		t = q & ~(~0 << 10);
+		q >>= 10;
+	} while (q >= 1000);
+
+	if (v < 0)
+		/* '-', up to 3 digits, '.', 1 digit, 1 character, null;
+		 * yields 8 bytes.
+		 */
+		return sprintf(buf, "-%llu.%i%c", q, t * 10 / 1024, units[u]);
+	else
+		return sprintf(buf, "%llu.%i%c", q, t * 10 / 1024, units[u]);
 }
 
 ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],

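The new bch_hprint() always divides at least once, bounds the integer part to three digits, and keeps one decimal digit, which is what makes the documented 8-byte buffer sufficient. A userspace port for poking at the output (the harness and sample values are mine; the mask is written as `& 1023`, equivalent to the kernel's `~(~0 << 10)`):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

static int hprint(char *buf, int64_t v)
{
	static const char units[] = "?kMGTPEZY";
	int u = 0, t;
	uint64_t q;

	if (v < 0)
		q = -v;
	else
		q = v;

	/* shift right by 10 until at most 3 digits remain, keeping the
	 * remainder for one digit after the decimal point */
	do {
		u++;
		t = q & 1023;
		q >>= 10;
	} while (q >= 1000);

	if (v < 0)
		return sprintf(buf, "-%" PRIu64 ".%i%c", q, t * 10 / 1024, units[u]);
	return sprintf(buf, "%" PRIu64 ".%i%c", q, t * 10 / 1024, units[u]);
}

int main(void)
{
	char buf[8];	/* '-', 3 digits, '.', 1 digit, unit, NUL */
	int64_t samples[] = { 512, 1024, 1536, -2048, 1048576 };
	size_t i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		hprint(buf, samples[i]);
		printf("%10" PRId64 " -> %s\n", samples[i], buf);
	}
	return 0;	/* prints 0.5k, 1.0k, 1.5k, -2.0k, 1.0M */
}
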
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -21,7 +21,8 @@
 static void __update_writeback_rate(struct cached_dev *dc)
 {
 	struct cache_set *c = dc->disk.c;
-	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
+	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
+				bcache_flash_devs_sectors_dirty(c);
 	uint64_t cache_dirty_target =
 		div_u64(cache_sectors * dc->writeback_percent, 100);
@@ -186,7 +187,7 @@ static void write_dirty(struct closure *cl)
 
 	closure_bio_submit(&io->bio, cl);
 
-	continue_at(cl, write_dirty_finish, system_wq);
+	continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
 }
 
 static void read_dirty_endio(struct bio *bio)
@@ -206,7 +207,7 @@ static void read_dirty_submit(struct closure *cl)
 
 	closure_bio_submit(&io->bio, cl);
 
-	continue_at(cl, write_dirty, system_wq);
+	continue_at(cl, write_dirty, io->dc->writeback_write_wq);
 }
 
 static void read_dirty(struct cached_dev *dc)
@@ -481,17 +482,17 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
 	return MAP_CONTINUE;
 }
 
-void bch_sectors_dirty_init(struct cached_dev *dc)
+void bch_sectors_dirty_init(struct bcache_device *d)
 {
 	struct sectors_dirty_init op;
 
 	bch_btree_op_init(&op.op, -1);
-	op.inode = dc->disk.id;
+	op.inode = d->id;
 
-	bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
+	bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
 			   sectors_dirty_init_fn, 0);
 
-	dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
+	d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
 }
 
 void bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -515,6 +516,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 
 int bch_cached_dev_writeback_start(struct cached_dev *dc)
 {
+	dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
+						WQ_MEM_RECLAIM, 0);
+	if (!dc->writeback_write_wq)
+		return -ENOMEM;
+
 	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
 					      "bcache_writeback");
 	if (IS_ERR(dc->writeback_thread))

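The first hunk above changes what writeback_percent is taken of: dirty sectors parked on flash-only volumes can never be written back to a backing device, so they are subtracted from the cache size before the target is computed. A worked example with made-up numbers shows the shift in cache_dirty_target:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
	/* hypothetical cache set: 2^20 buckets x 1024 sectors = 512 GiB */
	uint64_t nbuckets = 1ULL << 20, bucket_size = 1024;
	/* 200 GiB of dirty data on flash-only devices, in 512-byte sectors */
	uint64_t flash_dirty = 200ULL << 21;
	uint64_t writeback_percent = 10;

	uint64_t old_sectors = nbuckets * bucket_size;
	uint64_t new_sectors = old_sectors - flash_dirty;

	printf("old target: %" PRIu64 " sectors\n",
	       old_sectors * writeback_percent / 100);
	printf("new target: %" PRIu64 " sectors\n",
	       new_sectors * writeback_percent / 100);
	return 0;
}
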
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -14,6 +14,25 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
 	return ret;
 }
 
+static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c)
+{
+	uint64_t i, ret = 0;
+
+	mutex_lock(&bch_register_lock);
+
+	for (i = 0; i < c->nr_uuids; i++) {
+		struct bcache_device *d = c->devices[i];
+
+		if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
+			continue;
+		ret += bcache_dev_sectors_dirty(d);
+	}
+
+	mutex_unlock(&bch_register_lock);
+
+	return ret;
+}
+
 static inline unsigned offset_to_stripe(struct bcache_device *d,
 					uint64_t offset)
 {
@@ -84,7 +103,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
 
 void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
 
-void bch_sectors_dirty_init(struct cached_dev *dc);
+void bch_sectors_dirty_init(struct bcache_device *);
 void bch_cached_dev_writeback_init(struct cached_dev *);
 int bch_cached_dev_writeback_start(struct cached_dev *);