diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 75a717c03800..bd964e5b395f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2971,6 +2971,26 @@ firmware feature for updating multiple TCE entries at a time. + kswapd_per_node= + kswapd_per_node allows you to control the number of kswapd threads + running on the system. This provides the ability to devote additional + CPU resources toward proactive page replacement with the goal of + reducing direct reclaims. When direct reclaims are prevented, the CPU + consumed by them is prevented as well. Depending on the workload, the + result can cause aggregate CPU usage on the system to go up, down or + stay the same. + + More aggressive page replacement can reduce direct reclaims which + cause latency for tasks and decrease throughput when doing filesystem + IO through the pagecache. Direct reclaims are recorded using the + allocstall counter in /proc/vmstat. + + The range of acceptable values is 1-16. Always start with lower + values in the 2-6 range. Higher values should be justified with + testing. If direct reclaims occur in spite of high values, the + latency cost of those direct reclaims can be higher due to + increased lock contention. 
+ onenand.bdry= [HW,MTD] Flex-OneNAND Boundary Configuration Format: [die0_boundary][,die0_lock][,die1_boundary][,die1_lock] diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1df06a7f10dd..1ccd1ad16015 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -38,6 +38,8 @@ */ #define PAGE_ALLOC_COSTLY_ORDER 3 +#define MAX_KSWAPD_THREADS 16 + enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, @@ -769,6 +771,7 @@ typedef struct pglist_data { wait_queue_head_t pfmemalloc_wait; struct task_struct *kswapd; /* Protected by mem_hotplug_begin/end() */ + struct task_struct *mkswapd[MAX_KSWAPD_THREADS]; int kswapd_order; enum zone_type kswapd_highest_zoneidx; diff --git a/mm/vmscan.c b/mm/vmscan.c index 4c5a9b2286bf..0facbf9930d3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -171,6 +171,23 @@ struct scan_control { */ int vm_swappiness = 60; +#define DEF_KSWAPD_THREADS_PER_NODE 1 +int kswapd_threads = DEF_KSWAPD_THREADS_PER_NODE; +static int __init kswapd_per_node_setup(char *str) +{ + int tmp; + + if (kstrtoint(str, 0, &tmp) < 0) + return 0; + + if (tmp > MAX_KSWAPD_THREADS || tmp <= 0) + return 0; + + kswapd_threads = tmp; + return 1; +} +__setup("kswapd_per_node=", kswapd_per_node_setup); + static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) { @@ -3935,6 +3952,46 @@ kswapd_try_sleep: return 0; } +static int kswapd_per_node_run(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + int hid; + int ret = 0; + + for (hid = 0; hid < kswapd_threads; ++hid) { + pgdat->mkswapd[hid] = kthread_run(kswapd, pgdat, "kswapd%d:%d", + nid, hid); + if (IS_ERR(pgdat->mkswapd[hid])) { + /* failure at boot is fatal */ + WARN_ON(system_state < SYSTEM_RUNNING); + pr_err("Failed to start kswapd%d on node %d\n", + hid, nid); + ret = PTR_ERR(pgdat->mkswapd[hid]); + pgdat->mkswapd[hid] = NULL; + continue; + } + if (!pgdat->kswapd) + pgdat->kswapd = pgdat->mkswapd[hid]; + } + + return ret; +} + +static void kswapd_per_node_stop(int nid) 
+{ + int hid = 0; + struct task_struct *kswapd; + + for (hid = 0; hid < kswapd_threads; hid++) { + kswapd = NODE_DATA(nid)->mkswapd[hid]; + if (kswapd) { + kthread_stop(kswapd); + NODE_DATA(nid)->mkswapd[hid] = NULL; + } + } + NODE_DATA(nid)->kswapd = NULL; +} + /* * A zone is low on free memory or too fragmented for high-order memory. If * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's @@ -4038,6 +4095,9 @@ int kswapd_run(int nid) if (pgdat->kswapd) return 0; + if (kswapd_threads > 1) + return kswapd_per_node_run(nid); + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ @@ -4057,6 +4117,11 @@ void kswapd_stop(int nid) { struct task_struct *kswapd = NODE_DATA(nid)->kswapd; + if (kswapd_threads > 1) { + kswapd_per_node_stop(nid); + return; + } + if (kswapd) { kthread_stop(kswapd); NODE_DATA(nid)->kswapd = NULL;