s390/mm: fix 2KB pgtable release race
commit c2c224932fd0ee6854d6ebfc8d059c2bcad86606 upstream.
There is a race on concurrent 2KB-pgtables release paths when
both upper and lower halves of the containing parent page are
freed, one via page_table_free_rcu() + __tlb_remove_table(),
and the other via page_table_free(). The race might lead to
corruption as a result of removing the list item in page_table_free()
concurrently with __free_page() in __tlb_remove_table().
Let's assume that first the lower and then the upper 2KB-pgtable is
freed from a page. Since both halves of the page are allocated,
the tracking byte (bits 24-31 of the page _refcount) initially has
the value 0x03:
CPU0				CPU1
----				----
page_table_free_rcu() // lower half
{
	// _refcount[31..24] == 0x03
	...
	atomic_xor_bits(&page->_refcount,
			0x11U << (0 + 24));
	// _refcount[31..24] <= 0x12
	...
	table = table | (1U << 0);
	tlb_remove_table(tlb, table);
}
...
__tlb_remove_table()
{
	// _refcount[31..24] == 0x12
	mask = _table & 3;
	// mask <= 0x01
	...
				page_table_free() // upper half
				{
					// _refcount[31..24] == 0x12
					...
					atomic_xor_bits(
						&page->_refcount,
						1U << (1 + 24));
					// _refcount[31..24] <= 0x10
					// mask <= 0x10
					...
	atomic_xor_bits(&page->_refcount,
			mask << (4 + 24));
	// _refcount[31..24] <= 0x00
	// mask <= 0x00
	...
	if (mask != 0) // == false
		break;
	fallthrough;
	...
					if (mask & 3) // == false
						...
					else
	__free_page(page);			list_del(&page->lru);
	^^^^^^^^^^^^^^^^^^	RACE!		^^^^^^^^^^^^^^^^^^^^^
}					...
				}
The problem is that page_table_free() releases the page as a result of
the lower nibble being unset, and __tlb_remove_table() observes zero too
early. With this update, page_table_free() will use logic similar to
that of page_table_free_rcu() + __tlb_remove_table(), and mark
the fragment as pending for removal in the upper nibble until
after the list_del().
In other words, the parent page is considered unreferenced and
safe to release only when the lower nibble has already been cleared and
unsetting a bit in the upper nibble results in that nibble becoming zero.
Cc: stable@vger.kernel.org
Suggested-by: Vlastimil Babka <vbabka@suse.com>
Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
			
			
This commit is contained in:
		 Alexander Gordeev
					Alexander Gordeev
				
			
				
					committed by
					
						 Greg Kroah-Hartman
						Greg Kroah-Hartman
					
				
			
			
				
	
			
			
			 Greg Kroah-Hartman
						Greg Kroah-Hartman
					
				
			
						parent
						
							798754ba48
						
					
				
				
					commit
					ecb71f7bd5
				
			| @@ -253,13 +253,15 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) | |||||||
| 		/* Free 2K page table fragment of a 4K page */ | 		/* Free 2K page table fragment of a 4K page */ | ||||||
| 		bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); | 		bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); | ||||||
| 		spin_lock_bh(&mm->context.lock); | 		spin_lock_bh(&mm->context.lock); | ||||||
| 		mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24)); | 		mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); | ||||||
| 		mask >>= 24; | 		mask >>= 24; | ||||||
| 		if (mask & 3) | 		if (mask & 3) | ||||||
| 			list_add(&page->lru, &mm->context.pgtable_list); | 			list_add(&page->lru, &mm->context.pgtable_list); | ||||||
| 		else | 		else | ||||||
| 			list_del(&page->lru); | 			list_del(&page->lru); | ||||||
| 		spin_unlock_bh(&mm->context.lock); | 		spin_unlock_bh(&mm->context.lock); | ||||||
|  | 		mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24)); | ||||||
|  | 		mask >>= 24; | ||||||
| 		if (mask != 0) | 		if (mask != 0) | ||||||
| 			return; | 			return; | ||||||
| 	} else { | 	} else { | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user