@@ -77,6 +77,8 @@ static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
 
 static struct kmem_cache *mm_slot_cache __read_mostly;
 
+#define MAX_PTE_MAPPED_THP 8
+
 /**
  * struct mm_slot - hash lookup from mm to mm_slot
  * @hash: hash collision list
@@ -87,6 +89,10 @@ struct mm_slot {
 	struct hlist_node hash;
 	struct list_head mm_node;
 	struct mm_struct *mm;
+
+	/* pte-mapped THP in this mm */
+	int nr_pte_mapped_thp;
+	unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
 };
 
 /**
@@ -1254,6 +1260,159 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
 }
 
 #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+/*
+ * Notify khugepaged that the given addr of the mm is a pte-mapped THP.
+ * khugepaged should then try to collapse the page table.
+ */
+static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+					 unsigned long addr)
+{
+	struct mm_slot *mm_slot;
+
+	VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+
+	spin_lock(&khugepaged_mm_lock);
+	mm_slot = get_mm_slot(mm);
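+	/*
+	 * The per-mm array is fixed-size: if there is no mm_slot or the
+	 * array is already full, the address is silently dropped and the
+	 * THP simply stays pte-mapped.
+	 */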
+	if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
+		mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
+	spin_unlock(&khugepaged_mm_lock);
+	return 0;
+}
+
+/**
+ * Try to collapse a pte-mapped THP for mm at address haddr.
+ *
+ * This function checks whether all the PTEs in the PMD are pointing to the
+ * right THP. If so, retract the page table so the THP can refault in as
+ * pmd-mapped.
+ */
+void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
+{
+	unsigned long haddr = addr & HPAGE_PMD_MASK;
+	struct vm_area_struct *vma = find_vma(mm, haddr);
+	struct page *hpage = NULL;
+	pte_t *start_pte, *pte;
+	pmd_t *pmd, _pmd;
+	spinlock_t *ptl;
+	int count = 0;
+	int i;
+
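+	/* The whole PMD range must lie within a single file-backed VMA. */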
+	if (!vma || !vma->vm_file ||
+	    vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
+		return;
+
+	/*
+	 * This vma's vm_flags may not have VM_HUGEPAGE if the page was not
+	 * collapsed by this mm. But we can still collapse if the page is a
+	 * valid THP. Add an extra VM_HUGEPAGE so that hugepage_vma_check()
+	 * will not fail the vma for the missing VM_HUGEPAGE.
+	 */
+	if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
+		return;
+
+	pmd = mm_find_pmd(mm, haddr);
+	if (!pmd)
+		return;
+
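+	/* The PTE page table lock is held across steps 1 and 2 below. */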
+	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+
+	/* step 1: check that all mapped PTEs point to the right huge page */
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *page;
+
+		/* empty pte, skip */
+		if (pte_none(*pte))
+			continue;
+
+		/* page swapped out, abort */
+		if (!pte_present(*pte))
+			goto abort;
+
+		page = vm_normal_page(vma, addr, *pte);
+
+		if (!page || !PageCompound(page))
+			goto abort;
+
+		if (!hpage) {
+			hpage = compound_head(page);
+			/*
+			 * The mapping of the THP should not change.
+			 *
+			 * Note that uprobe, debugger, or MAP_PRIVATE may
+			 * change the page table, but the new page will
+			 * not pass PageCompound() check.
+			 */
+			if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping))
+				goto abort;
+		}
+
+		/*
+		 * Confirm the pte maps the correct subpage of the THP.
+		 *
+		 * Note that uprobe, debugger, or MAP_PRIVATE may change
+		 * the page table, but the new page will not pass
+		 * PageCompound() check.
+		 */
+		if (WARN_ON(hpage + i != page))
+			goto abort;
+		count++;
+	}
+
+	/* step 2: adjust rmap */
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *page;
+
+		if (pte_none(*pte))
+			continue;
+		page = vm_normal_page(vma, addr, *pte);
+		page_remove_rmap(page, false);
+	}
+
+	pte_unmap_unlock(start_pte, ptl);
+
+	/* step 3: set proper refcount and mm_counters. */
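+	/*
+	 * Each of the 'count' PTEs found in step 1 held one reference on the
+	 * compound page and contributed one entry to the file mm_counter;
+	 * drop both in bulk here.
+	 */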
+	if (hpage) {
+		page_ref_sub(hpage, count);
+		add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+	}
+
+	/* step 4: collapse pmd */
+	ptl = pmd_lock(vma->vm_mm, pmd);
+	_pmd = pmdp_collapse_flush(vma, haddr, pmd);
+	spin_unlock(ptl);
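+	/* Account for and free the page table the old pmd pointed to. */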
+	mm_dec_nr_ptes(mm);
+	pte_free(mm, pmd_pgtable(_pmd));
+	return;
+
+abort:
+	pte_unmap_unlock(start_pte, ptl);
+}
+
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+	struct mm_struct *mm = mm_slot->mm;
+	int i;
+
+	if (likely(mm_slot->nr_pte_mapped_thp == 0))
+		return 0;
+
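+	/* If mmap_sem is contended, keep the entries and retry on a later scan. */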
+	if (!down_write_trylock(&mm->mmap_sem))
+		return -EBUSY;
+
+	if (unlikely(khugepaged_test_exit(mm)))
+		goto out;
+
+	for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
+		collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
+
+out:
+	mm_slot->nr_pte_mapped_thp = 0;
+	up_write(&mm->mmap_sem);
+	return 0;
+}
+
 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 {
 	struct vm_area_struct *vma;
@@ -1262,7 +1421,22 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 
 	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
-		/* probably overkill */
+		/*
+		 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
+		 * got written to. These VMAs are likely not worth the cost of
+		 * down_write(mmap_sem), as the PMD-mapping is likely to be
+		 * split again later.
+		 *
+		 * Note that the vma->anon_vma check is racy: it can be set up
+		 * after the check but before we take mmap_sem by the fault
+		 * path. But the page lock would prevent establishing any new
+		 * ptes of the page, so we are safe.
+		 *
+		 * An alternative would be to drop the check, but to check that
+		 * the page table is clear before calling pmdp_collapse_flush()
+		 * under ptl. That has a higher chance of recovering the THP
+		 * for the VMA, but also a higher cost.
+		 */
 		if (vma->anon_vma)
 			continue;
 		addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
@@ -1275,9 +1449,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 			continue;
 		/*
 		 * We need exclusive mmap_sem to retract page table.
-		 * If trylock fails we would end up with pte-mapped THP after
-		 * re-fault. Not ideal, but it's more important to not disturb
-		 * the system too much.
+		 *
+		 * We use trylock due to lock inversion: we need to acquire
+		 * mmap_sem while holding the page lock, while the fault path
+		 * takes them in the reverse order. Trylock avoids the deadlock.
 		 */
 		if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
 			spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
@@ -1287,6 +1462,9 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 			up_write(&vma->vm_mm->mmap_sem);
 			mm_dec_nr_ptes(vma->vm_mm);
 			pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+		} else {
+			/* Try again later */
+			khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
 		}
 	}
 	i_mmap_unlock_write(mapping);
@@ -1709,6 +1887,11 @@ static void khugepaged_scan_file(struct mm_struct *mm,
 {
 	BUILD_BUG();
 }
+
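+/* Without shmem THP support there is nothing to collapse; provide a no-op stub. */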
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+	return 0;
+}
 #endif
 
 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
@@ -1733,6 +1916,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 		khugepaged_scan.mm_slot = mm_slot;
 	}
 	spin_unlock(&khugepaged_mm_lock);
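+	/* First, collapse any pte-mapped THPs recorded for this mm. */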
+	khugepaged_collapse_pte_mapped_thps(mm_slot);
 
 	mm = mm_slot->mm;
 	/*