Pull Automatic NUMA Balancing bare-bones from Mel Gorman:
"There are three implementations for NUMA balancing, this tree
(balancenuma), numacore which has been developed in tip/master and
autonuma which is in aa.git.
In almost all respects balancenuma is the dumbest of the three because
its main impact is on the VM side with no attempt to be smart about
scheduling. In the interest of getting the ball rolling, it would be
desirable to see this much merged for 3.8 with the view to building
scheduler smarts on top and adapting the VM where required for 3.9.
The most recent set of comparisons available from different people are
mel: https://lkml.org/lkml/2012/12/9/108
mingo: https://lkml.org/lkml/2012/12/7/331
tglx: https://lkml.org/lkml/2012/12/10/437
srikar: https://lkml.org/lkml/2012/12/10/397
The results are a mixed bag. In my own tests, balancenuma does
reasonably well. It's dumb as rocks and does not regress against
mainline. On the other hand, Ingo's tests shows that balancenuma is
incapable of converging for this workloads driven by perf which is bad
but is potentially explained by the lack of scheduler smarts. Thomas'
results show balancenuma improves on mainline but falls far short of
numacore or autonuma. Srikar's results indicate we all suffer on a
large machine with imbalanced node sizes.
My own testing showed that recent numacore results have improved
dramatically, particularly in the last week but not universally.
We've butted heads heavily on system CPU usage and high levels of
migration even when it shows that overall performance is better.
There are also cases where it regresses. Of interest is that for
specjbb in some configurations it will regress for lower numbers of
warehouses and show gains for higher numbers which is not reported by
the tool by default and sometimes missed in treports. Recently I
reported for numacore that the JVM was crashing with
NullPointerExceptions but currently it's unclear what the source of
this problem is. Initially I thought it was in how numacore batch
handles PTEs but I'm no longer think this is the case. It's possible
numacore is just able to trigger it due to higher rates of migration.
These reports were quite late in the cycle so I/we would like to start
with this tree as it contains much of the code we can agree on and has
not changed significantly over the last 2-3 weeks."
* tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux-balancenuma: (50 commits)
mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable
mm/rmap: Convert the struct anon_vma::mutex to an rwsem
mm: migrate: Account a transhuge page properly when rate limiting
mm: numa: Account for failed allocations and isolations as migration failures
mm: numa: Add THP migration for the NUMA working set scanning fault case build fix
mm: numa: Add THP migration for the NUMA working set scanning fault case.
mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node
mm: sched: numa: Control enabling and disabling of NUMA balancing if !SCHED_DEBUG
mm: sched: numa: Control enabling and disabling of NUMA balancing
mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate
mm: numa: Use a two-stage filter to restrict pages being migrated for unlikely task<->node relationships
mm: numa: migrate: Set last_nid on newly allocated page
mm: numa: split_huge_page: Transfer last_nid on tail page
mm: numa: Introduce last_nid to the page frame
sched: numa: Slowly increase the scanning period as NUMA faults are handled
mm: numa: Rate limit setting of pte_numa if node is saturated
mm: numa: Rate limit the amount of memory that is migrated between nodes
mm: numa: Structures for Migrate On Fault per NUMA migration rate limiting
mm: numa: Migrate pages handled during a pmd_numa hinting fault
mm: numa: Migrate on reference policy
...
227 lines
7.2 KiB
C
227 lines
7.2 KiB
C
#ifndef _LINUX_HUGE_MM_H
|
|
#define _LINUX_HUGE_MM_H
|
|
|
|
extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
|
|
struct vm_area_struct *vma,
|
|
unsigned long address, pmd_t *pmd,
|
|
unsigned int flags);
|
|
extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
|
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
|
|
struct vm_area_struct *vma);
|
|
extern void huge_pmd_set_accessed(struct mm_struct *mm,
|
|
struct vm_area_struct *vma,
|
|
unsigned long address, pmd_t *pmd,
|
|
pmd_t orig_pmd, int dirty);
|
|
extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
unsigned long address, pmd_t *pmd,
|
|
pmd_t orig_pmd);
|
|
extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
|
|
unsigned long addr,
|
|
pmd_t *pmd,
|
|
unsigned int flags);
|
|
extern int zap_huge_pmd(struct mmu_gather *tlb,
|
|
struct vm_area_struct *vma,
|
|
pmd_t *pmd, unsigned long addr);
|
|
extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
|
unsigned long addr, unsigned long end,
|
|
unsigned char *vec);
|
|
extern int move_huge_pmd(struct vm_area_struct *vma,
|
|
struct vm_area_struct *new_vma,
|
|
unsigned long old_addr,
|
|
unsigned long new_addr, unsigned long old_end,
|
|
pmd_t *old_pmd, pmd_t *new_pmd);
|
|
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
|
unsigned long addr, pgprot_t newprot,
|
|
int prot_numa);
|
|
|
|
enum transparent_hugepage_flag {
|
|
TRANSPARENT_HUGEPAGE_FLAG,
|
|
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
|
|
TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
|
|
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
|
|
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
|
|
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
|
|
#ifdef CONFIG_DEBUG_VM
|
|
TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
|
|
#endif
|
|
};
|
|
|
|
enum page_check_address_pmd_flag {
|
|
PAGE_CHECK_ADDRESS_PMD_FLAG,
|
|
PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG,
|
|
PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG,
|
|
};
|
|
extern pmd_t *page_check_address_pmd(struct page *page,
|
|
struct mm_struct *mm,
|
|
unsigned long address,
|
|
enum page_check_address_pmd_flag flag);
|
|
|
|
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
|
|
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
|
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
#define HPAGE_PMD_SHIFT HPAGE_SHIFT
|
|
#define HPAGE_PMD_MASK HPAGE_MASK
|
|
#define HPAGE_PMD_SIZE HPAGE_SIZE
|
|
|
|
extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
|
|
|
|
#define transparent_hugepage_enabled(__vma) \
|
|
((transparent_hugepage_flags & \
|
|
(1<<TRANSPARENT_HUGEPAGE_FLAG) || \
|
|
(transparent_hugepage_flags & \
|
|
(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG) && \
|
|
((__vma)->vm_flags & VM_HUGEPAGE))) && \
|
|
!((__vma)->vm_flags & VM_NOHUGEPAGE) && \
|
|
!is_vma_temporary_stack(__vma))
|
|
#define transparent_hugepage_defrag(__vma) \
|
|
((transparent_hugepage_flags & \
|
|
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)) || \
|
|
(transparent_hugepage_flags & \
|
|
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG) && \
|
|
(__vma)->vm_flags & VM_HUGEPAGE))
|
|
#define transparent_hugepage_use_zero_page() \
|
|
(transparent_hugepage_flags & \
|
|
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
|
|
#ifdef CONFIG_DEBUG_VM
|
|
#define transparent_hugepage_debug_cow() \
|
|
(transparent_hugepage_flags & \
|
|
(1<<TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG))
|
|
#else /* CONFIG_DEBUG_VM */
|
|
#define transparent_hugepage_debug_cow() 0
|
|
#endif /* CONFIG_DEBUG_VM */
|
|
|
|
extern unsigned long transparent_hugepage_flags;
|
|
extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
|
pmd_t *dst_pmd, pmd_t *src_pmd,
|
|
struct vm_area_struct *vma,
|
|
unsigned long addr, unsigned long end);
|
|
extern int handle_pte_fault(struct mm_struct *mm,
|
|
struct vm_area_struct *vma, unsigned long address,
|
|
pte_t *pte, pmd_t *pmd, unsigned int flags);
|
|
extern int split_huge_page(struct page *page);
|
|
extern void __split_huge_page_pmd(struct vm_area_struct *vma,
|
|
unsigned long address, pmd_t *pmd);
|
|
#define split_huge_page_pmd(__vma, __address, __pmd) \
|
|
do { \
|
|
pmd_t *____pmd = (__pmd); \
|
|
if (unlikely(pmd_trans_huge(*____pmd))) \
|
|
__split_huge_page_pmd(__vma, __address, \
|
|
____pmd); \
|
|
} while (0)
|
|
#define wait_split_huge_page(__anon_vma, __pmd) \
|
|
do { \
|
|
pmd_t *____pmd = (__pmd); \
|
|
anon_vma_lock_write(__anon_vma); \
|
|
anon_vma_unlock(__anon_vma); \
|
|
BUG_ON(pmd_trans_splitting(*____pmd) || \
|
|
pmd_trans_huge(*____pmd)); \
|
|
} while (0)
|
|
extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
|
|
pmd_t *pmd);
|
|
#if HPAGE_PMD_ORDER > MAX_ORDER
|
|
#error "hugepages can't be allocated by the buddy allocator"
|
|
#endif
|
|
extern int hugepage_madvise(struct vm_area_struct *vma,
|
|
unsigned long *vm_flags, int advice);
|
|
extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
|
|
unsigned long start,
|
|
unsigned long end,
|
|
long adjust_next);
|
|
extern int __pmd_trans_huge_lock(pmd_t *pmd,
|
|
struct vm_area_struct *vma);
|
|
/* mmap_sem must be held on entry */
|
|
static inline int pmd_trans_huge_lock(pmd_t *pmd,
|
|
struct vm_area_struct *vma)
|
|
{
|
|
VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
|
|
if (pmd_trans_huge(*pmd))
|
|
return __pmd_trans_huge_lock(pmd, vma);
|
|
else
|
|
return 0;
|
|
}
|
|
static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
|
|
unsigned long start,
|
|
unsigned long end,
|
|
long adjust_next)
|
|
{
|
|
if (!vma->anon_vma || vma->vm_ops)
|
|
return;
|
|
__vma_adjust_trans_huge(vma, start, end, adjust_next);
|
|
}
|
|
static inline int hpage_nr_pages(struct page *page)
|
|
{
|
|
if (unlikely(PageTransHuge(page)))
|
|
return HPAGE_PMD_NR;
|
|
return 1;
|
|
}
|
|
static inline struct page *compound_trans_head(struct page *page)
|
|
{
|
|
if (PageTail(page)) {
|
|
struct page *head;
|
|
head = page->first_page;
|
|
smp_rmb();
|
|
/*
|
|
* head may be a dangling pointer.
|
|
* __split_huge_page_refcount clears PageTail before
|
|
* overwriting first_page, so if PageTail is still
|
|
* there it means the head pointer isn't dangling.
|
|
*/
|
|
if (PageTail(page))
|
|
return head;
|
|
}
|
|
return page;
|
|
}
|
|
|
|
extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
unsigned long addr, pmd_t pmd, pmd_t *pmdp);
|
|
|
|
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
|
|
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
|
|
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
|
|
#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })
|
|
|
|
#define hpage_nr_pages(x) 1
|
|
|
|
#define transparent_hugepage_enabled(__vma) 0
|
|
|
|
#define transparent_hugepage_flags 0UL
|
|
static inline int split_huge_page(struct page *page)
|
|
{
|
|
return 0;
|
|
}
|
|
#define split_huge_page_pmd(__vma, __address, __pmd) \
|
|
do { } while (0)
|
|
#define wait_split_huge_page(__anon_vma, __pmd) \
|
|
do { } while (0)
|
|
#define split_huge_page_pmd_mm(__mm, __address, __pmd) \
|
|
do { } while (0)
|
|
#define compound_trans_head(page) compound_head(page)
|
|
static inline int hugepage_madvise(struct vm_area_struct *vma,
|
|
unsigned long *vm_flags, int advice)
|
|
{
|
|
BUG();
|
|
return 0;
|
|
}
|
|
static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
|
|
unsigned long start,
|
|
unsigned long end,
|
|
long adjust_next)
|
|
{
|
|
}
|
|
static inline int pmd_trans_huge_lock(pmd_t *pmd,
|
|
struct vm_area_struct *vma)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
unsigned long addr, pmd_t pmd, pmd_t *pmdp)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
|
|
|
#endif /* _LINUX_HUGE_MM_H */
|