Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:

 - the rest of MM

 - procfs updates

 - various misc things

 - more y2038 fixes

 - get_maintainer updates

 - lib/ updates

 - checkpatch updates

 - various epoll updates

 - autofs updates

 - hfsplus

 - some reiserfs work

 - fatfs updates

 - signal.c cleanups

 - ipc/ updates

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (166 commits)
  ipc/util.c: update return value of ipc_getref from int to bool
  ipc/util.c: further variable name cleanups
  ipc: simplify ipc initialization
  ipc: get rid of ids->tables_initialized hack
  lib/rhashtable: guarantee initial hashtable allocation
  lib/rhashtable: simplify bucket_table_alloc()
  ipc: drop ipc_lock()
  ipc/util.c: correct comment in ipc_obtain_object_check
  ipc: rename ipcctl_pre_down_nolock()
  ipc/util.c: use ipc_rcu_putref() for failues in ipc_addid()
  ipc: reorganize initialization of kern_ipc_perm.seq
  ipc: compute kern_ipc_perm.id under the ipc lock
  init/Kconfig: remove EXPERT from CHECKPOINT_RESTORE
  fs/sysv/inode.c: use ktime_get_real_seconds() for superblock stamp
  adfs: use timespec64 for time conversion
  kernel/sysctl.c: fix typos in comments
  drivers/rapidio/devices/rio_mport_cdev.c: remove redundant pointer md
  fork: don't copy inconsistent signal handler state to child
  signal: make get_signal() return bool
  signal: make sigkill_pending() return bool
  ...
 mm/oom_kill.c | 219 changed lines

--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -400,7 +400,8 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
         struct task_struct *p;
         struct task_struct *task;
 
-        pr_info("[ pid ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
+        pr_info("Tasks state (memory values in pages):\n");
+        pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
         rcu_read_lock();
         for_each_process(p) {
                 if (oom_unkillable_task(p, memcg, nodemask))
@@ -416,7 +417,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
                         continue;
                 }
 
-                pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
+                pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
                         task->pid, from_kuid(&init_user_ns, task_uid(task)),
                         task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
                         mm_pgtables_bytes(task->mm),
@@ -487,9 +488,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
 static struct task_struct *oom_reaper_list;
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
-void __oom_reap_task_mm(struct mm_struct *mm)
+bool __oom_reap_task_mm(struct mm_struct *mm)
 {
         struct vm_area_struct *vma;
+        bool ret = true;
 
         /*
          * Tell all users of get_user/copy_from_user etc... that the content
@@ -519,50 +521,32 @@ void __oom_reap_task_mm(struct mm_struct *mm)
                         struct mmu_gather tlb;
 
                         tlb_gather_mmu(&tlb, mm, start, end);
-                        mmu_notifier_invalidate_range_start(mm, start, end);
+                        if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
+                                ret = false;
+                                continue;
+                        }
                         unmap_page_range(&tlb, vma, start, end, NULL);
                         mmu_notifier_invalidate_range_end(mm, start, end);
                         tlb_finish_mmu(&tlb, start, end);
                 }
         }
+
+        return ret;
 }
 
+/*
+ * Reaps the address space of the give task.
+ *
+ * Returns true on success and false if none or part of the address space
+ * has been reclaimed and the caller should retry later.
+ */
 static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 {
         bool ret = true;
 
-        /*
-         * We have to make sure to not race with the victim exit path
-         * and cause premature new oom victim selection:
-         * oom_reap_task_mm             exit_mm
-         *   mmget_not_zero
-         *                                mmput
-         *                                  atomic_dec_and_test
-         *                                exit_oom_victim
-         *                              [...]
-         *                              out_of_memory
-         *                                select_bad_process
-         *                                  # no TIF_MEMDIE task selects new victim
-         *  unmap_page_range # frees some memory
-         */
-        mutex_lock(&oom_lock);
-
         if (!down_read_trylock(&mm->mmap_sem)) {
-                ret = false;
                 trace_skip_task_reaping(tsk->pid);
-                goto unlock_oom;
-        }
-
-        /*
-         * If the mm has invalidate_{start,end}() notifiers that could block,
-         * sleep to give the oom victim some more time.
-         * TODO: we really want to get rid of this ugly hack and make sure that
-         * notifiers cannot block for unbounded amount of time
-         */
-        if (mm_has_blockable_invalidate_notifiers(mm)) {
-                up_read(&mm->mmap_sem);
-                schedule_timeout_idle(HZ);
-                goto unlock_oom;
+                return false;
         }
 
         /*
@@ -572,25 +556,27 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
          * down_write();up_write() cycle in exit_mmap().
          */
         if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
-                up_read(&mm->mmap_sem);
                 trace_skip_task_reaping(tsk->pid);
-                goto unlock_oom;
+                goto out_unlock;
         }
 
         trace_start_task_reaping(tsk->pid);
 
-        __oom_reap_task_mm(mm);
+        /* failed to reap part of the address space. Try again later */
+        ret = __oom_reap_task_mm(mm);
+        if (!ret)
+                goto out_finish;
 
         pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
                         task_pid_nr(tsk), tsk->comm,
                         K(get_mm_counter(mm, MM_ANONPAGES)),
                         K(get_mm_counter(mm, MM_FILEPAGES)),
                         K(get_mm_counter(mm, MM_SHMEMPAGES)));
+out_finish:
+        trace_finish_task_reaping(tsk->pid);
+out_unlock:
         up_read(&mm->mmap_sem);
 
-        trace_finish_task_reaping(tsk->pid);
-unlock_oom:
-        mutex_unlock(&oom_lock);
         return ret;
 }
 
@@ -843,68 +829,12 @@ static bool task_will_free_mem(struct task_struct *task)
         return ret;
 }
 
-static void oom_kill_process(struct oom_control *oc, const char *message)
+static void __oom_kill_process(struct task_struct *victim)
 {
-        struct task_struct *p = oc->chosen;
-        unsigned int points = oc->chosen_points;
-        struct task_struct *victim = p;
-        struct task_struct *child;
-        struct task_struct *t;
+        struct task_struct *p;
         struct mm_struct *mm;
-        unsigned int victim_points = 0;
-        static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
-                                      DEFAULT_RATELIMIT_BURST);
         bool can_oom_reap = true;
 
-        /*
-         * If the task is already exiting, don't alarm the sysadmin or kill
-         * its children or threads, just give it access to memory reserves
-         * so it can die quickly
-         */
-        task_lock(p);
-        if (task_will_free_mem(p)) {
-                mark_oom_victim(p);
-                wake_oom_reaper(p);
-                task_unlock(p);
-                put_task_struct(p);
-                return;
-        }
-        task_unlock(p);
-
-        if (__ratelimit(&oom_rs))
-                dump_header(oc, p);
-
-        pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
-                message, task_pid_nr(p), p->comm, points);
-
-        /*
-         * If any of p's children has a different mm and is eligible for kill,
-         * the one with the highest oom_badness() score is sacrificed for its
-         * parent. This attempts to lose the minimal amount of work done while
-         * still freeing memory.
-         */
-        read_lock(&tasklist_lock);
-        for_each_thread(p, t) {
-                list_for_each_entry(child, &t->children, sibling) {
-                        unsigned int child_points;
-
-                        if (process_shares_mm(child, p->mm))
-                                continue;
-                        /*
-                         * oom_badness() returns 0 if the thread is unkillable
-                         */
-                        child_points = oom_badness(child,
-                                oc->memcg, oc->nodemask, oc->totalpages);
-                        if (child_points > victim_points) {
-                                put_task_struct(victim);
-                                victim = child;
-                                victim_points = child_points;
-                                get_task_struct(victim);
-                        }
-                }
-        }
-        read_unlock(&tasklist_lock);
-
         p = find_lock_task_mm(victim);
         if (!p) {
                 put_task_struct(victim);
@@ -978,6 +908,99 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 }
 #undef K
 
+/*
+ * Kill provided task unless it's secured by setting
+ * oom_score_adj to OOM_SCORE_ADJ_MIN.
+ */
+static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+{
+        if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+                get_task_struct(task);
+                __oom_kill_process(task);
+        }
+        return 0;
+}
+
+static void oom_kill_process(struct oom_control *oc, const char *message)
+{
+        struct task_struct *p = oc->chosen;
+        unsigned int points = oc->chosen_points;
+        struct task_struct *victim = p;
+        struct task_struct *child;
+        struct task_struct *t;
+        struct mem_cgroup *oom_group;
+        unsigned int victim_points = 0;
+        static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+                                      DEFAULT_RATELIMIT_BURST);
+
+        /*
+         * If the task is already exiting, don't alarm the sysadmin or kill
+         * its children or threads, just give it access to memory reserves
+         * so it can die quickly
+         */
+        task_lock(p);
+        if (task_will_free_mem(p)) {
+                mark_oom_victim(p);
+                wake_oom_reaper(p);
+                task_unlock(p);
+                put_task_struct(p);
+                return;
+        }
+        task_unlock(p);
+
+        if (__ratelimit(&oom_rs))
+                dump_header(oc, p);
+
+        pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
+                message, task_pid_nr(p), p->comm, points);
+
+        /*
+         * If any of p's children has a different mm and is eligible for kill,
+         * the one with the highest oom_badness() score is sacrificed for its
+         * parent. This attempts to lose the minimal amount of work done while
+         * still freeing memory.
+         */
+        read_lock(&tasklist_lock);
+        for_each_thread(p, t) {
+                list_for_each_entry(child, &t->children, sibling) {
+                        unsigned int child_points;
+
+                        if (process_shares_mm(child, p->mm))
+                                continue;
+                        /*
+                         * oom_badness() returns 0 if the thread is unkillable
+                         */
+                        child_points = oom_badness(child,
+                                oc->memcg, oc->nodemask, oc->totalpages);
+                        if (child_points > victim_points) {
+                                put_task_struct(victim);
+                                victim = child;
+                                victim_points = child_points;
+                                get_task_struct(victim);
+                        }
+                }
+        }
+        read_unlock(&tasklist_lock);
+
+        /*
+         * Do we need to kill the entire memory cgroup?
+         * Or even one of the ancestor memory cgroups?
+         * Check this out before killing the victim task.
+         */
+        oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
+
+        __oom_kill_process(victim);
+
+        /*
+         * If necessary, kill all tasks in the selected memory cgroup.
+         */
+        if (oom_group) {
+                mem_cgroup_print_oom_group(oom_group);
+                mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
+                mem_cgroup_put(oom_group);
+        }
+}
+
 /*
  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
  */
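The largest addition in this diff is the memory.oom.group handling at the tail of the new oom_kill_process(): it resolves the victim's owning group with mem_cgroup_get_oom_group(), kills the selected victim, and then walks every task in that group, killing each one unless its oom_score_adj is OOM_SCORE_ADJ_MIN. The stand-alone sketch below models that control flow in plain user-space C; the task and mem_cgroup structs and all helper names are simplified stand-ins for illustration, not the kernel's types or API.

```c
/* oom_group_model.c - simplified user-space model of the group-kill flow
 * added by this diff. Every type and helper here is a stand-in; only the
 * overall shape (kill victim, then scan the group with a callback) mirrors
 * the kernel code above.
 */
#include <stdio.h>
#include <stddef.h>

#define OOM_SCORE_ADJ_MIN (-1000)

struct task {
        const char *comm;
        int pid;
        int oom_score_adj;
        int killed;
};

struct mem_cgroup {
        const char *name;
        struct task *tasks;
        size_t nr_tasks;
};

/* Stand-in for __oom_kill_process(): just mark the task as killed. */
static void kill_process(struct task *t)
{
        t->killed = 1;
        printf("killed %s (pid %d)\n", t->comm, t->pid);
}

/* Stand-in for oom_kill_memcg_member(): spare OOM_SCORE_ADJ_MIN tasks. */
static int kill_memcg_member(struct task *t, void *unused)
{
        (void)unused;
        if (t->oom_score_adj != OOM_SCORE_ADJ_MIN && !t->killed)
                kill_process(t);
        return 0;
}

/* Stand-in for mem_cgroup_scan_tasks(): apply fn to every task in the group. */
static void scan_tasks(struct mem_cgroup *g,
                       int (*fn)(struct task *, void *), void *arg)
{
        for (size_t i = 0; i < g->nr_tasks; i++)
                fn(&g->tasks[i], arg);
}

int main(void)
{
        struct task tasks[] = {
                { "worker-1", 101, 0,                 0 },
                { "worker-2", 102, 0,                 0 },
                { "agent",    103, OOM_SCORE_ADJ_MIN, 0 },
        };
        struct mem_cgroup group = { "job-a", tasks, 3 };
        struct task *victim = &tasks[0];

        /* Mirrors the tail of the new oom_kill_process(): kill the chosen
         * victim, then kill every other eligible task in its oom group.
         */
        kill_process(victim);
        printf("group %s: killing remaining eligible tasks\n", group.name);
        scan_tasks(&group, kill_memcg_member, NULL);
        return 0;
}
```

The callback-plus-iterator shape mirrors how the kernel passes oom_kill_memcg_member() to mem_cgroup_scan_tasks(), keeping the per-task policy (the OOM_SCORE_ADJ_MIN exemption) separate from the cgroup traversal.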