Merge tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block

Pull block updates from Jens Axboe:

 - Two NVMe pull requests:
     - ana log parse fix from Anton
     - nvme quirks support for Apple devices from Ben
     - fix missing bio completion tracing for multipath stack devices
       from Hannes and Mikhail
     - IP TOS settings for nvme rdma and tcp transports from Israel
     - rq_dma_dir cleanups from Israel
     - tracing for Get LBA Status command from Minwoo
     - Some nvme-tcp cleanups from Minwoo, Potnuri and Myself
     - Some consolidation between the fabrics transports for handling
       the CAP register
     - reset race with ns scanning fix for fabrics (move fabrics
       commands to a dedicated request queue with a different lifetime
       from the admin request queue)."
     - controller reset and namespace scan races fixes
     - nvme discovery log change uevent support
     - naming improvements from Keith
     - multiple discovery controllers reject fix from James
     - some regular cleanups from various people

 - Series fixing (and re-fixing) null_blk debug printing and nr_devices
   checks (André)

 - A few pull requests from Song, with fixes from Andy, Guoqing,
   Guilherme, Neil, Nigel, and Yufen.

 - REQ_OP_ZONE_RESET_ALL support (Chaitanya)

 - Bio merge handling unification (Christoph)

 - Pick default elevator correctly for devices with special needs
   (Damien)

 - Block stats fixes (Hou)

 - Timeout and support devices nbd fixes (Mike)

 - Series fixing races around elevator switching and device add/remove
   (Ming)

 - sed-opal cleanups (Revanth)

 - Per device weight support for BFQ (Fam)

 - Support for blk-iocost, a new model that can properly account cost of
   IO workloads. (Tejun)

 - blk-cgroup writeback fixes (Tejun)

 - paride queue init fixes (zhengbin)

 - blk_set_runtime_active() cleanup (Stanley)

 - Block segment mapping optimizations (Bart)

 - lightnvm fixes (Hans/Minwoo/YueHaibing)

 - Various little fixes and cleanups

* tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block: (186 commits)
  null_blk: format pr_* logs with pr_fmt
  null_blk: match the type of parameter nr_devices
  null_blk: do not fail the module load with zero devices
  block: also check RQF_STATS in blk_mq_need_time_stamp()
  block: make rq sector size accessible for block stats
  bfq: Fix bfq linkage error
  raid5: use bio_end_sector in r5_next_bio
  raid5: remove STRIPE_OPS_REQ_PENDING
  md: add feature flag MD_FEATURE_RAID0_LAYOUT
  md/raid0: avoid RAID0 data corruption due to layout confusion.
  raid5: don't set STRIPE_HANDLE to stripe which is in batch list
  raid5: don't increment read_errors on EILSEQ return
  nvmet: fix a wrong error status returned in error log page
  nvme: send discovery log page change events to userspace
  nvme: add uevent variables for controller devices
  nvme: enable aen regardless of the presence of I/O queues
  nvme-fabrics: allow discovery subsystems accept a kato
  nvmet: Use PTR_ERR_OR_ZERO() in nvmet_init_discovery()
  nvme: Remove redundant assignment of cq vector
  nvme: Assign subsys instance from first ctrl
  ...
This commit is contained in:
Linus Torvalds
2019-09-17 16:57:47 -07:00
107개의 변경된 파일5894개의 추가작업 그리고 1282개의 파일을 삭제

파일 보기

@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -22,10 +23,12 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
/*
* bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
* locking.
* bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
* reader side locking.
*/
DEFINE_SPINLOCK(bdi_lock);
static u64 bdi_id_cursor;
static struct rb_root bdi_tree = RB_ROOT;
LIST_HEAD(bdi_list);
/* bdi_wq serves all asynchronous writeback tasks */
@@ -615,13 +618,12 @@ out_put:
}
/**
* wb_get_create - get wb for a given memcg, create if necessary
* wb_get_lookup - get wb for a given memcg
* @bdi: target bdi
* @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
* @gfp: allocation mask to use
*
* Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
* create one. The returned wb has its refcount incremented.
* Try to get the wb for @memcg_css on @bdi. The returned wb has its
* refcount incremented.
*
* This function uses css_get() on @memcg_css and thus expects its refcnt
* to be positive on invocation. IOW, rcu_read_lock() protection on
@@ -638,6 +640,39 @@ out_put:
* each lookup. On mismatch, the existing wb is discarded and a new one is
* created.
*/
struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css)
{
struct bdi_writeback *wb;
if (!memcg_css->parent)
return &bdi->wb;
rcu_read_lock();
wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
if (wb) {
struct cgroup_subsys_state *blkcg_css;
/* see whether the blkcg association has changed */
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
wb = NULL;
css_put(blkcg_css);
}
rcu_read_unlock();
return wb;
}
/**
* wb_get_create - get wb for a given memcg, create if necessary
* @bdi: target bdi
* @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
* @gfp: allocation mask to use
*
* Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
* create one. See wb_get_lookup() for more details.
*/
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css,
gfp_t gfp)
@@ -650,20 +685,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
return &bdi->wb;
do {
rcu_read_lock();
wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
if (wb) {
struct cgroup_subsys_state *blkcg_css;
/* see whether the blkcg association has changed */
blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
&io_cgrp_subsys);
if (unlikely(wb->blkcg_css != blkcg_css ||
!wb_tryget(wb)))
wb = NULL;
css_put(blkcg_css);
}
rcu_read_unlock();
wb = wb_get_lookup(bdi, memcg_css);
} while (!wb && !cgwb_create(bdi, memcg_css, gfp));
return wb;
@@ -859,9 +881,58 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
}
EXPORT_SYMBOL(bdi_alloc_node);
static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
{
struct rb_node **p = &bdi_tree.rb_node;
struct rb_node *parent = NULL;
struct backing_dev_info *bdi;
lockdep_assert_held(&bdi_lock);
while (*p) {
parent = *p;
bdi = rb_entry(parent, struct backing_dev_info, rb_node);
if (bdi->id > id)
p = &(*p)->rb_left;
else if (bdi->id < id)
p = &(*p)->rb_right;
else
break;
}
if (parentp)
*parentp = parent;
return p;
}
/**
* bdi_get_by_id - lookup and get bdi from its id
* @id: bdi id to lookup
*
* Find bdi matching @id and get it. Returns NULL if the matching bdi
* doesn't exist or is already unregistered.
*/
struct backing_dev_info *bdi_get_by_id(u64 id)
{
struct backing_dev_info *bdi = NULL;
struct rb_node **p;
spin_lock_bh(&bdi_lock);
p = bdi_lookup_rb_node(id, NULL);
if (*p) {
bdi = rb_entry(*p, struct backing_dev_info, rb_node);
bdi_get(bdi);
}
spin_unlock_bh(&bdi_lock);
return bdi;
}
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
struct device *dev;
struct rb_node *parent, **p;
if (bdi->dev) /* The driver needs to use separate queues per device */
return 0;
@@ -877,7 +948,15 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
set_bit(WB_registered, &bdi->wb.state);
spin_lock_bh(&bdi_lock);
bdi->id = ++bdi_id_cursor;
p = bdi_lookup_rb_node(bdi->id, &parent);
rb_link_node(&bdi->rb_node, parent, p);
rb_insert_color(&bdi->rb_node, &bdi_tree);
list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
spin_unlock_bh(&bdi_lock);
trace_writeback_bdi_register(bdi);
@@ -918,6 +997,7 @@ EXPORT_SYMBOL(bdi_register_owner);
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
spin_lock_bh(&bdi_lock);
rb_erase(&bdi->rb_node, &bdi_tree);
list_del_rcu(&bdi->bdi_list);
spin_unlock_bh(&bdi_lock);

파일 보기

@@ -87,6 +87,10 @@ int do_swap_account __read_mostly;
#define do_swap_account 0
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
@@ -4172,6 +4176,8 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
#ifdef CONFIG_CGROUP_WRITEBACK
#include <trace/events/writeback.h>
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
return wb_domain_init(&memcg->cgwb_domain, gfp);
@@ -4255,6 +4261,130 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
}
}
/*
* Foreign dirty flushing
*
* There's an inherent mismatch between memcg and writeback. The former
* trackes ownership per-page while the latter per-inode. This was a
* deliberate design decision because honoring per-page ownership in the
* writeback path is complicated, may lead to higher CPU and IO overheads
* and deemed unnecessary given that write-sharing an inode across
* different cgroups isn't a common use-case.
*
* Combined with inode majority-writer ownership switching, this works well
* enough in most cases but there are some pathological cases. For
* example, let's say there are two cgroups A and B which keep writing to
* different but confined parts of the same inode. B owns the inode and
* A's memory is limited far below B's. A's dirty ratio can rise enough to
* trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
* triggering background writeback. A will be slowed down without a way to
* make writeback of the dirty pages happen.
*
* Conditions like the above can lead to a cgroup getting repatedly and
* severely throttled after making some progress after each
* dirty_expire_interval while the underyling IO device is almost
* completely idle.
*
* Solving this problem completely requires matching the ownership tracking
* granularities between memcg and writeback in either direction. However,
* the more egregious behaviors can be avoided by simply remembering the
* most recent foreign dirtying events and initiating remote flushes on
* them when local writeback isn't enough to keep the memory clean enough.
*
* The following two functions implement such mechanism. When a foreign
* page - a page whose memcg and writeback ownerships don't match - is
* dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
* bdi_writeback on the page owning memcg. When balance_dirty_pages()
* decides that the memcg needs to sleep due to high dirty ratio, it calls
* mem_cgroup_flush_foreign() which queues writeback on the recorded
* foreign bdi_writebacks which haven't expired. Both the numbers of
* recorded bdi_writebacks and concurrent in-flight foreign writebacks are
* limited to MEMCG_CGWB_FRN_CNT.
*
* The mechanism only remembers IDs and doesn't hold any object references.
* As being wrong occasionally doesn't matter, updates and accesses to the
* records are lockless and racy.
*/
void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
struct bdi_writeback *wb)
{
struct mem_cgroup *memcg = page->mem_cgroup;
struct memcg_cgwb_frn *frn;
u64 now = get_jiffies_64();
u64 oldest_at = now;
int oldest = -1;
int i;
trace_track_foreign_dirty(page, wb);
/*
* Pick the slot to use. If there is already a slot for @wb, keep
* using it. If not replace the oldest one which isn't being
* written out.
*/
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
frn = &memcg->cgwb_frn[i];
if (frn->bdi_id == wb->bdi->id &&
frn->memcg_id == wb->memcg_css->id)
break;
if (time_before64(frn->at, oldest_at) &&
atomic_read(&frn->done.cnt) == 1) {
oldest = i;
oldest_at = frn->at;
}
}
if (i < MEMCG_CGWB_FRN_CNT) {
/*
* Re-using an existing one. Update timestamp lazily to
* avoid making the cacheline hot. We want them to be
* reasonably up-to-date and significantly shorter than
* dirty_expire_interval as that's what expires the record.
* Use the shorter of 1s and dirty_expire_interval / 8.
*/
unsigned long update_intv =
min_t(unsigned long, HZ,
msecs_to_jiffies(dirty_expire_interval * 10) / 8);
if (time_before64(frn->at, now - update_intv))
frn->at = now;
} else if (oldest >= 0) {
/* replace the oldest free one */
frn = &memcg->cgwb_frn[oldest];
frn->bdi_id = wb->bdi->id;
frn->memcg_id = wb->memcg_css->id;
frn->at = now;
}
}
/* issue foreign writeback flushes for recorded foreign dirtying events */
void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
u64 now = jiffies_64;
int i;
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
/*
* If the record is older than dirty_expire_interval,
* writeback on it has already started. No need to kick it
* off again. Also, don't start a new one if there's
* already one in flight.
*/
if (time_after64(frn->at, now - intv) &&
atomic_read(&frn->done.cnt) == 1) {
frn->at = 0;
trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
WB_REASON_FOREIGN_FLUSH,
&frn->done);
}
}
}
#else /* CONFIG_CGROUP_WRITEBACK */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
@@ -4777,6 +4907,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
struct mem_cgroup *memcg;
unsigned int size;
int node;
int __maybe_unused i;
size = sizeof(struct mem_cgroup);
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
@@ -4820,6 +4951,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
memcg->cgwb_frn[i].done =
__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
return memcg;
@@ -4949,7 +5083,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
int __maybe_unused i;
#ifdef CONFIG_CGROUP_WRITEBACK
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
wb_wait_for_completion(&memcg->cgwb_frn[i].done);
#endif
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);

파일 보기

@@ -1667,6 +1667,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
if (unlikely(!writeback_in_progress(wb)))
wb_start_background_writeback(wb);
mem_cgroup_flush_foreign(wb);
/*
* Calculate global domain's pos_ratio and select the
* global dtc by default.
@@ -2427,6 +2429,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
task_io_account_write(PAGE_SIZE);
current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits);
mem_cgroup_track_foreign_dirty(page, wb);
}
}