
Commit 27e1f82

liu-song-6 authored and torvalds committed
khugepaged: enable collapse pmd for pte-mapped THP
khugepaged needs exclusive mmap_sem to access the page table. When it fails to lock mmap_sem, the page will fault in as a pte-mapped THP. As the page is already a THP, khugepaged will not handle this pmd again.

This patch enables khugepaged to retry collapsing the page table.

struct mm_slot (in khugepaged.c) is extended with an array containing the addresses of pte-mapped THPs. We use an array here for simplicity; it can easily be replaced with a more advanced data structure when needed.

In khugepaged_scan_mm_slot(), if the mm contains a pte-mapped THP, we try to collapse the page table.

Since the collapse may happen at a later time, some pages may already have faulted in by then. collapse_pte_mapped_thp() is added to properly handle these pages. collapse_pte_mapped_thp() also double-checks whether all ptes in this pmd map to the same THP. This is necessary because a subpage of the THP may have been replaced, for example by uprobe. In such cases, it is not possible to collapse the pmd.

[[email protected]: add comments for retract_page_tables()]
Link: http://lkml.kernel.org/r/20190816145443.6ard3iilytc6jlgv@box
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Song Liu <[email protected]>
Signed-off-by: Kirill A. Shutemov <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
Suggested-by: Johannes Weiner <[email protected]>
Reviewed-by: Oleg Nesterov <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 5a52c9d commit 27e1f82
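The bookkeeping this commit adds is deliberately small: each mm_slot gains a fixed array of up to MAX_PTE_MAPPED_THP pmd-aligned addresses, retract_page_tables() records an address there when its mmap_sem trylock fails, and khugepaged_scan_mm_slot() later drains the array with the write lock held. The snippet below is a minimal user-space model of that deferred-retry flow, not the kernel code itself (the real implementation follows in the diff); mm_slot_model, record_pte_mapped_thp(), drain_pte_mapped_thps() and fake_collapse() are hypothetical stand-ins for struct mm_slot, khugepaged_add_pte_mapped_thp(), khugepaged_collapse_pte_mapped_thps() and collapse_pte_mapped_thp(), with all locking elided.

#include <stdbool.h>
#include <stdio.h>

#define MAX_PTE_MAPPED_THP 8        /* same per-mm bound the patch uses */

/* Model of the fields added to struct mm_slot. */
struct mm_slot_model {
        int nr_pte_mapped_thp;
        unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
};

/* Stand-in for khugepaged_add_pte_mapped_thp(): remember an address to retry. */
static void record_pte_mapped_thp(struct mm_slot_model *slot, unsigned long haddr)
{
        if (slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)
                slot->pte_mapped_thp[slot->nr_pte_mapped_thp++] = haddr;
        /* If the array is full the address is simply dropped, as in the patch. */
}

/* Stand-in for khugepaged_collapse_pte_mapped_thps(): retry every saved address. */
static void drain_pte_mapped_thps(struct mm_slot_model *slot,
                                  void (*collapse)(unsigned long haddr))
{
        for (int i = 0; i < slot->nr_pte_mapped_thp; i++)
                collapse(slot->pte_mapped_thp[i]);
        slot->nr_pte_mapped_thp = 0;    /* cleared whether or not collapse succeeded */
}

static void fake_collapse(unsigned long haddr)
{
        printf("retry collapse of pmd at %#lx\n", haddr);
}

int main(void)
{
        struct mm_slot_model slot = { 0 };
        bool got_mmap_sem = false;      /* pretend the trylock in retract_page_tables() failed */

        if (!got_mmap_sem)
                record_pte_mapped_thp(&slot, 0x7f0000200000UL);

        /* Later, khugepaged's scan loop drains the list with mmap_sem held for write. */
        drain_pte_mapped_thps(&slot, fake_collapse);
        return 0;
}

Compiled with any C99 compiler, this only prints the address it would retry, which is enough to see why the array does not need to track success or failure: in the kernel, collapse_pte_mapped_thp() re-validates every pte itself before touching the pmd.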

File tree: 2 files changed (+200 −4 lines changed)


include/linux/khugepaged.h

Lines changed: 12 additions & 0 deletions

@@ -15,6 +15,14 @@ extern int __khugepaged_enter(struct mm_struct *mm);
 extern void __khugepaged_exit(struct mm_struct *mm);
 extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
                                      unsigned long vm_flags);
+#ifdef CONFIG_SHMEM
+extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr);
+#else
+static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
+                                           unsigned long addr)
+{
+}
+#endif
 
 #define khugepaged_enabled()                                           \
        (transparent_hugepage_flags &                                  \
@@ -73,6 +81,10 @@ static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
 {
        return 0;
 }
+static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
+                                           unsigned long addr)
+{
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_KHUGEPAGED_H */

mm/khugepaged.c

Lines changed: 188 additions & 4 deletions
@@ -77,6 +77,8 @@ static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
 
 static struct kmem_cache *mm_slot_cache __read_mostly;
 
+#define MAX_PTE_MAPPED_THP 8
+
 /**
  * struct mm_slot - hash lookup from mm to mm_slot
  * @hash: hash collision list
@@ -87,6 +89,10 @@ struct mm_slot {
        struct hlist_node hash;
        struct list_head mm_node;
        struct mm_struct *mm;
+
+       /* pte-mapped THP in this mm */
+       int nr_pte_mapped_thp;
+       unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
 };
 
 /**
@@ -1254,6 +1260,159 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
 }
 
 #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+/*
+ * Notify khugepaged that given addr of the mm is pte-mapped THP. Then
+ * khugepaged should try to collapse the page table.
+ */
+static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+                                         unsigned long addr)
+{
+       struct mm_slot *mm_slot;
+
+       VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+
+       spin_lock(&khugepaged_mm_lock);
+       mm_slot = get_mm_slot(mm);
+       if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
+               mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
+       spin_unlock(&khugepaged_mm_lock);
+       return 0;
+}
+
+/**
+ * Try to collapse a pte-mapped THP for mm at address haddr.
+ *
+ * This function checks whether all the PTEs in the PMD are pointing to the
+ * right THP. If so, retract the page table so the THP can refault in with
+ * as pmd-mapped.
+ */
+void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
+{
+       unsigned long haddr = addr & HPAGE_PMD_MASK;
+       struct vm_area_struct *vma = find_vma(mm, haddr);
+       struct page *hpage = NULL;
+       pte_t *start_pte, *pte;
+       pmd_t *pmd, _pmd;
+       spinlock_t *ptl;
+       int count = 0;
+       int i;
+
+       if (!vma || !vma->vm_file ||
+           vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
+               return;
+
+       /*
+        * This vm_flags may not have VM_HUGEPAGE if the page was not
+        * collapsed by this mm. But we can still collapse if the page is
+        * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
+        * will not fail the vma for missing VM_HUGEPAGE
+        */
+       if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
+               return;
+
+       pmd = mm_find_pmd(mm, haddr);
+       if (!pmd)
+               return;
+
+       start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+
+       /* step 1: check all mapped PTEs are to the right huge page */
+       for (i = 0, addr = haddr, pte = start_pte;
+            i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+               struct page *page;
+
+               /* empty pte, skip */
+               if (pte_none(*pte))
+                       continue;
+
+               /* page swapped out, abort */
+               if (!pte_present(*pte))
+                       goto abort;
+
+               page = vm_normal_page(vma, addr, *pte);
+
+               if (!page || !PageCompound(page))
+                       goto abort;
+
+               if (!hpage) {
+                       hpage = compound_head(page);
+                       /*
+                        * The mapping of the THP should not change.
+                        *
+                        * Note that uprobe, debugger, or MAP_PRIVATE may
+                        * change the page table, but the new page will
+                        * not pass PageCompound() check.
+                        */
+                       if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping))
+                               goto abort;
+               }
+
+               /*
+                * Confirm the page maps to the correct subpage.
+                *
+                * Note that uprobe, debugger, or MAP_PRIVATE may change
+                * the page table, but the new page will not pass
+                * PageCompound() check.
+                */
+               if (WARN_ON(hpage + i != page))
+                       goto abort;
+               count++;
+       }
+
+       /* step 2: adjust rmap */
+       for (i = 0, addr = haddr, pte = start_pte;
+            i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+               struct page *page;
+
+               if (pte_none(*pte))
+                       continue;
+               page = vm_normal_page(vma, addr, *pte);
+               page_remove_rmap(page, false);
+       }
+
+       pte_unmap_unlock(start_pte, ptl);
+
+       /* step 3: set proper refcount and mm_counters. */
+       if (hpage) {
+               page_ref_sub(hpage, count);
+               add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+       }
+
+       /* step 4: collapse pmd */
+       ptl = pmd_lock(vma->vm_mm, pmd);
+       _pmd = pmdp_collapse_flush(vma, addr, pmd);
+       spin_unlock(ptl);
+       mm_dec_nr_ptes(mm);
+       pte_free(mm, pmd_pgtable(_pmd));
+       return;
+
+abort:
+       pte_unmap_unlock(start_pte, ptl);
+}
+
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+       struct mm_struct *mm = mm_slot->mm;
+       int i;
+
+       if (likely(mm_slot->nr_pte_mapped_thp == 0))
+               return 0;
+
+       if (!down_write_trylock(&mm->mmap_sem))
+               return -EBUSY;
+
+       if (unlikely(khugepaged_test_exit(mm)))
+               goto out;
+
+       for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
+               collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
+
+out:
+       mm_slot->nr_pte_mapped_thp = 0;
+       up_write(&mm->mmap_sem);
+       return 0;
+}
+
 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 {
        struct vm_area_struct *vma;
@@ -1262,7 +1421,22 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 
        i_mmap_lock_write(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
-               /* probably overkill */
+               /*
+                * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
+                * got written to. These VMAs are likely not worth investing
+                * down_write(mmap_sem) as PMD-mapping is likely to be split
+                * later.
+                *
+                * Not that vma->anon_vma check is racy: it can be set up after
+                * the check but before we took mmap_sem by the fault path.
+                * But page lock would prevent establishing any new ptes of the
+                * page, so we are safe.
+                *
+                * An alternative would be drop the check, but check that page
+                * table is clear before calling pmdp_collapse_flush() under
+                * ptl. It has higher chance to recover THP for the VMA, but
+                * has higher cost too.
+                */
                if (vma->anon_vma)
                        continue;
                addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
@@ -1275,9 +1449,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                        continue;
                /*
                 * We need exclusive mmap_sem to retract page table.
-                * If trylock fails we would end up with pte-mapped THP after
-                * re-fault. Not ideal, but it's more important to not disturb
-                * the system too much.
+                *
+                * We use trylock due to lock inversion: we need to acquire
+                * mmap_sem while holding page lock. Fault path does it in
+                * reverse order. Trylock is a way to avoid deadlock.
                 */
                if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
                        spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
@@ -1287,6 +1462,9 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                        up_write(&vma->vm_mm->mmap_sem);
                        mm_dec_nr_ptes(vma->vm_mm);
                        pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+               } else {
+                       /* Try again later */
+                       khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
                }
        }
        i_mmap_unlock_write(mapping);
@@ -1709,6 +1887,11 @@ static void khugepaged_scan_file(struct mm_struct *mm,
 {
        BUILD_BUG();
 }
+
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+       return 0;
+}
 #endif
 
 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
@@ -1733,6 +1916,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
                khugepaged_scan.mm_slot = mm_slot;
        }
        spin_unlock(&khugepaged_mm_lock);
+       khugepaged_collapse_pte_mapped_thps(mm_slot);
 
        mm = mm_slot->mm;
        /*
