Merge tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:

 - Two NVMe pull requests:
     - ana log parse fix from Anton
     - nvme quirks support for Apple devices from Ben
     - fix missing bio completion tracing for multipath stack devices from Hannes and Mikhail
     - IP TOS settings for nvme rdma and tcp transports from Israel
     - rq_dma_dir cleanups from Israel
     - tracing for Get LBA Status command from Minwoo
     - Some nvme-tcp cleanups from Minwoo, Potnuri and Myself
     - Some consolidation between the fabrics transports for handling the CAP register
     - reset race with ns scanning fix for fabrics (move fabrics commands to a dedicated request queue with a different lifetime from the admin request queue)."
     - controller reset and namespace scan races fixes
     - nvme discovery log change uevent support
     - naming improvements from Keith
     - multiple discovery controllers reject fix from James
     - some regular cleanups from various people

 - Series fixing (and re-fixing) null_blk debug printing and nr_devices checks (André)

 - A few pull requests from Song, with fixes from Andy, Guoqing, Guilherme, Neil, Nigel, and Yufen.

 - REQ_OP_ZONE_RESET_ALL support (Chaitanya)

 - Bio merge handling unification (Christoph)

 - Pick default elevator correctly for devices with special needs (Damien)

 - Block stats fixes (Hou)

 - Timeout and support devices nbd fixes (Mike)

 - Series fixing races around elevator switching and device add/remove (Ming)

 - sed-opal cleanups (Revanth)

 - Per device weight support for BFQ (Fam)

 - Support for blk-iocost, a new model that can properly account cost of IO workloads. (Tejun)

 - blk-cgroup writeback fixes (Tejun)

 - paride queue init fixes (zhengbin)

 - blk_set_runtime_active() cleanup (Stanley)

 - Block segment mapping optimizations (Bart)

 - lightnvm fixes (Hans/Minwoo/YueHaibing)

 - Various little fixes and cleanups

* tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block: (186 commits)
  null_blk: format pr_* logs with pr_fmt
  null_blk: match the type of parameter nr_devices
  null_blk: do not fail the module load with zero devices
  block: also check RQF_STATS in blk_mq_need_time_stamp()
  block: make rq sector size accessible for block stats
  bfq: Fix bfq linkage error
  raid5: use bio_end_sector in r5_next_bio
  raid5: remove STRIPE_OPS_REQ_PENDING
  md: add feature flag MD_FEATURE_RAID0_LAYOUT
  md/raid0: avoid RAID0 data corruption due to layout confusion.
  raid5: don't set STRIPE_HANDLE to stripe which is in batch list
  raid5: don't increment read_errors on EILSEQ return
  nvmet: fix a wrong error status returned in error log page
  nvme: send discovery log page change events to userspace
  nvme: add uevent variables for controller devices
  nvme: enable aen regardless of the presence of I/O queues
  nvme-fabrics: allow discovery subsystems accept a kato
  nvmet: Use PTR_ERR_OR_ZERO() in nvmet_init_discovery()
  nvme: Remove redundant assignment of cq vector
  nvme: Assign subsys instance from first ctrl
  ...
 mm/backing-dev.c | 120
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
 #include <linux/wait.h>
+#include <linux/rbtree.h>
 #include <linux/backing-dev.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -22,10 +23,12 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info);
 static struct class *bdi_class;
 
 /*
- * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
- * locking.
+ * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
+ * reader side locking.
  */
 DEFINE_SPINLOCK(bdi_lock);
+static u64 bdi_id_cursor;
+static struct rb_root bdi_tree = RB_ROOT;
 LIST_HEAD(bdi_list);
 
 /* bdi_wq serves all asynchronous writeback tasks */
@@ -615,13 +618,12 @@ out_put:
 }
 
 /**
- * wb_get_create - get wb for a given memcg, create if necessary
+ * wb_get_lookup - get wb for a given memcg
  * @bdi: target bdi
  * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
- * @gfp: allocation mask to use
  *
- * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
- * create one. The returned wb has its refcount incremented.
+ * Try to get the wb for @memcg_css on @bdi. The returned wb has its
+ * refcount incremented.
  *
  * This function uses css_get() on @memcg_css and thus expects its refcnt
  * to be positive on invocation. IOW, rcu_read_lock() protection on
@@ -638,6 +640,39 @@ out_put:
  * each lookup. On mismatch, the existing wb is discarded and a new one is
  * created.
  */
+struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
+                                    struct cgroup_subsys_state *memcg_css)
+{
+        struct bdi_writeback *wb;
+
+        if (!memcg_css->parent)
+                return &bdi->wb;
+
+        rcu_read_lock();
+        wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+        if (wb) {
+                struct cgroup_subsys_state *blkcg_css;
+
+                /* see whether the blkcg association has changed */
+                blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
+                if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
+                        wb = NULL;
+                css_put(blkcg_css);
+        }
+        rcu_read_unlock();
+
+        return wb;
+}
+
+/**
+ * wb_get_create - get wb for a given memcg, create if necessary
+ * @bdi: target bdi
+ * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
+ * @gfp: allocation mask to use
+ *
+ * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
+ * create one. See wb_get_lookup() for more details.
+ */
 struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
                                     struct cgroup_subsys_state *memcg_css,
                                     gfp_t gfp)
@@ -650,20 +685,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
                 return &bdi->wb;
 
         do {
-                rcu_read_lock();
-                wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
-                if (wb) {
-                        struct cgroup_subsys_state *blkcg_css;
-
-                        /* see whether the blkcg association has changed */
-                        blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
-                                                     &io_cgrp_subsys);
-                        if (unlikely(wb->blkcg_css != blkcg_css ||
-                                     !wb_tryget(wb)))
-                                wb = NULL;
-                        css_put(blkcg_css);
-                }
-                rcu_read_unlock();
+                wb = wb_get_lookup(bdi, memcg_css);
         } while (!wb && !cgwb_create(bdi, memcg_css, gfp));
 
         return wb;
@@ -859,9 +881,58 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
 }
 EXPORT_SYMBOL(bdi_alloc_node);
 
+static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
+{
+        struct rb_node **p = &bdi_tree.rb_node;
+        struct rb_node *parent = NULL;
+        struct backing_dev_info *bdi;
+
+        lockdep_assert_held(&bdi_lock);
+
+        while (*p) {
+                parent = *p;
+                bdi = rb_entry(parent, struct backing_dev_info, rb_node);
+
+                if (bdi->id > id)
+                        p = &(*p)->rb_left;
+                else if (bdi->id < id)
+                        p = &(*p)->rb_right;
+                else
+                        break;
+        }
+
+        if (parentp)
+                *parentp = parent;
+        return p;
+}
+
+/**
+ * bdi_get_by_id - lookup and get bdi from its id
+ * @id: bdi id to lookup
+ *
+ * Find bdi matching @id and get it. Returns NULL if the matching bdi
+ * doesn't exist or is already unregistered.
+ */
+struct backing_dev_info *bdi_get_by_id(u64 id)
+{
+        struct backing_dev_info *bdi = NULL;
+        struct rb_node **p;
+
+        spin_lock_bh(&bdi_lock);
+        p = bdi_lookup_rb_node(id, NULL);
+        if (*p) {
+                bdi = rb_entry(*p, struct backing_dev_info, rb_node);
+                bdi_get(bdi);
+        }
+        spin_unlock_bh(&bdi_lock);
+
+        return bdi;
+}
+
 int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
 {
         struct device *dev;
+        struct rb_node *parent, **p;
 
         if (bdi->dev)   /* The driver needs to use separate queues per device */
                 return 0;
@@ -877,7 +948,15 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
         set_bit(WB_registered, &bdi->wb.state);
 
         spin_lock_bh(&bdi_lock);
+
+        bdi->id = ++bdi_id_cursor;
+
+        p = bdi_lookup_rb_node(bdi->id, &parent);
+        rb_link_node(&bdi->rb_node, parent, p);
+        rb_insert_color(&bdi->rb_node, &bdi_tree);
+
         list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+
         spin_unlock_bh(&bdi_lock);
 
         trace_writeback_bdi_register(bdi);
@@ -918,6 +997,7 @@ EXPORT_SYMBOL(bdi_register_owner);
 static void bdi_remove_from_list(struct backing_dev_info *bdi)
 {
         spin_lock_bh(&bdi_lock);
+        rb_erase(&bdi->rb_node, &bdi_tree);
         list_del_rcu(&bdi->bdi_list);
         spin_unlock_bh(&bdi_lock);
 
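For context, here is a minimal caller sketch of the new bdi_get_by_id() interface added above. This is not part of the commit; the function name example_use_bdi and its error handling are illustrative assumptions, while bdi_get_by_id() and bdi_put() are the real kernel helpers.

/* Sketch only: look up a registered bdi by its new u64 id and drop the
 * reference when done. bdi_get_by_id() returns NULL if no bdi with that
 * id is currently registered. */
#include <linux/backing-dev.h>
#include <linux/errno.h>

static int example_use_bdi(u64 id)
{
        struct backing_dev_info *bdi = bdi_get_by_id(id);

        if (!bdi)
                return -ENOENT; /* id not in bdi_tree, or already unregistered */

        /* ... operate on the bdi, e.g. hand its id to writeback machinery ... */

        bdi_put(bdi);           /* release the reference taken by bdi_get_by_id() */
        return 0;
}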
 mm/memcontrol.c | 139
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -87,6 +87,10 @@ int do_swap_account __read_mostly;
 #define do_swap_account		0
 #endif
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+#endif
+
 /* Whether legacy memory+swap accounting is active */
 static bool do_memsw_account(void)
 {
@@ -4172,6 +4176,8 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
+#include <trace/events/writeback.h>
+
 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
 {
         return wb_domain_init(&memcg->cgwb_domain, gfp);
@@ -4255,6 +4261,130 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
         }
 }
 
+/*
+ * Foreign dirty flushing
+ *
+ * There's an inherent mismatch between memcg and writeback. The former
+ * trackes ownership per-page while the latter per-inode. This was a
+ * deliberate design decision because honoring per-page ownership in the
+ * writeback path is complicated, may lead to higher CPU and IO overheads
+ * and deemed unnecessary given that write-sharing an inode across
+ * different cgroups isn't a common use-case.
+ *
+ * Combined with inode majority-writer ownership switching, this works well
+ * enough in most cases but there are some pathological cases. For
+ * example, let's say there are two cgroups A and B which keep writing to
+ * different but confined parts of the same inode. B owns the inode and
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
+ * triggering background writeback. A will be slowed down without a way to
+ * make writeback of the dirty pages happen.
+ *
+ * Conditions like the above can lead to a cgroup getting repatedly and
+ * severely throttled after making some progress after each
+ * dirty_expire_interval while the underyling IO device is almost
+ * completely idle.
+ *
+ * Solving this problem completely requires matching the ownership tracking
+ * granularities between memcg and writeback in either direction. However,
+ * the more egregious behaviors can be avoided by simply remembering the
+ * most recent foreign dirtying events and initiating remote flushes on
+ * them when local writeback isn't enough to keep the memory clean enough.
+ *
+ * The following two functions implement such mechanism. When a foreign
+ * page - a page whose memcg and writeback ownerships don't match - is
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
+ * limited to MEMCG_CGWB_FRN_CNT.
+ *
+ * The mechanism only remembers IDs and doesn't hold any object references.
+ * As being wrong occasionally doesn't matter, updates and accesses to the
+ * records are lockless and racy.
+ */
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+                                             struct bdi_writeback *wb)
+{
+        struct mem_cgroup *memcg = page->mem_cgroup;
+        struct memcg_cgwb_frn *frn;
+        u64 now = get_jiffies_64();
+        u64 oldest_at = now;
+        int oldest = -1;
+        int i;
+
+        trace_track_foreign_dirty(page, wb);
+
+        /*
+         * Pick the slot to use. If there is already a slot for @wb, keep
+         * using it. If not replace the oldest one which isn't being
+         * written out.
+         */
+        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+                frn = &memcg->cgwb_frn[i];
+                if (frn->bdi_id == wb->bdi->id &&
+                    frn->memcg_id == wb->memcg_css->id)
+                        break;
+                if (time_before64(frn->at, oldest_at) &&
+                    atomic_read(&frn->done.cnt) == 1) {
+                        oldest = i;
+                        oldest_at = frn->at;
+                }
+        }
+
+        if (i < MEMCG_CGWB_FRN_CNT) {
+                /*
+                 * Re-using an existing one. Update timestamp lazily to
+                 * avoid making the cacheline hot. We want them to be
+                 * reasonably up-to-date and significantly shorter than
+                 * dirty_expire_interval as that's what expires the record.
+                 * Use the shorter of 1s and dirty_expire_interval / 8.
+                 */
+                unsigned long update_intv =
+                        min_t(unsigned long, HZ,
+                              msecs_to_jiffies(dirty_expire_interval * 10) / 8);
+
+                if (time_before64(frn->at, now - update_intv))
+                        frn->at = now;
+        } else if (oldest >= 0) {
+                /* replace the oldest free one */
+                frn = &memcg->cgwb_frn[oldest];
+                frn->bdi_id = wb->bdi->id;
+                frn->memcg_id = wb->memcg_css->id;
+                frn->at = now;
+        }
+}
+
+/* issue foreign writeback flushes for recorded foreign dirtying events */
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+{
+        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+        unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
+        u64 now = jiffies_64;
+        int i;
+
+        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+                struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
+
+                /*
+                 * If the record is older than dirty_expire_interval,
+                 * writeback on it has already started. No need to kick it
+                 * off again. Also, don't start a new one if there's
+                 * already one in flight.
+                 */
+                if (time_after64(frn->at, now - intv) &&
+                    atomic_read(&frn->done.cnt) == 1) {
+                        frn->at = 0;
+                        trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
+                        cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+                                               WB_REASON_FOREIGN_FLUSH,
+                                               &frn->done);
+                }
+        }
+}
+
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
@@ -4777,6 +4907,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
         struct mem_cgroup *memcg;
         unsigned int size;
         int node;
+        int __maybe_unused i;
 
         size = sizeof(struct mem_cgroup);
         size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
@@ -4820,6 +4951,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 #endif
 #ifdef CONFIG_CGROUP_WRITEBACK
         INIT_LIST_HEAD(&memcg->cgwb_list);
+        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+                memcg->cgwb_frn[i].done =
+                        __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
 #endif
         idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
         return memcg;
@@ -4949,7 +5083,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 {
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+        int __maybe_unused i;
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+                wb_wait_for_completion(&memcg->cgwb_frn[i].done);
+#endif
         if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
                 static_branch_dec(&memcg_sockets_enabled_key);
 
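The hunks above index into memcg->cgwb_frn[], a small per-memcg array of foreign-dirty records that the same series adds in include/linux/memcontrol.h (not shown in this diff). As a rough, paraphrased sketch of that record, with field comments written here rather than copied from the header:

/* Approximate shape of one foreign-dirty record; the exact definition
 * lives in include/linux/memcontrol.h. Each mem_cgroup carries
 * MEMCG_CGWB_FRN_CNT of these in its cgwb_frn[] array. */
struct memcg_cgwb_frn {
        u64 bdi_id;                     /* id of the bdi the foreign inode belongs to */
        int memcg_id;                   /* css id of the wb's owning memcg */
        u64 at;                         /* jiffies_64 timestamp of the dirtying event */
        struct wb_completion done;      /* tracks the in-flight foreign writeback */
};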
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1667,6 +1667,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
                 if (unlikely(!writeback_in_progress(wb)))
                         wb_start_background_writeback(wb);
 
+                mem_cgroup_flush_foreign(wb);
+
                 /*
                  * Calculate global domain's pos_ratio and select the
                  * global dtc by default.
@@ -2427,6 +2429,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
                 task_io_account_write(PAGE_SIZE);
                 current->nr_dirtied++;
                 this_cpu_inc(bdp_ratelimits);
+
+                mem_cgroup_track_foreign_dirty(page, wb);
         }
 }
 
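account_page_dirtied() calls mem_cgroup_track_foreign_dirty(), a fast-path wrapper that the same series defines in include/linux/memcontrol.h and that is not part of the hunks shown here. It only falls through to the _slowpath function from the mm/memcontrol.c diff when the page's memcg and the wb's owning memcg differ. The sketch below is an approximation from memory, not a verbatim copy of the header:

static inline void mem_cgroup_track_foreign_dirty(struct page *page,
                                                  struct bdi_writeback *wb)
{
        if (mem_cgroup_disabled())
                return;

        /* only foreign pages (page memcg != wb's memcg) take the slowpath */
        if (unlikely(&page->mem_cgroup->css != wb->memcg_css))
                mem_cgroup_track_foreign_dirty_slowpath(page, wb);
}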