Skip to content

Commit 26aa2d1

Browse files
hansendctorvalds
authored andcommitted
mm/migrate: demote pages during reclaim
This is mostly derived from a patch from Yang Shi: https://lore.kernel.org/linux-mm/[email protected]/ Add code to the reclaim path (shrink_page_list()) to "demote" data to another NUMA node instead of discarding the data. This always avoids the cost of I/O needed to read the page back in and sometimes avoids the writeout cost when the page is dirty. A second pass through shrink_page_list() will be made if any demotions fail. This essentially falls back to normal reclaim behavior in the case that demotions fail. Previous versions of this patch may have simply failed to reclaim pages which were eligible for demotion but were unable to be demoted in practice. In some cases, for example MADV_PAGEOUT, the pages are always discarded instead of demoted, to follow the kernel API definition: MADV_PAGEOUT is defined as freeing the specified pages regardless of which tier they are in. Note: This just adds the start of infrastructure for migration. It is actually disabled next to the FIXME in can_demote(). [[email protected]: v11] Link: https://lkml.kernel.org/r/[email protected] Link: https://lkml.kernel.org/r/[email protected] Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Dave Hansen <[email protected]> Signed-off-by: "Huang, Ying" <[email protected]> Reviewed-by: Yang Shi <[email protected]> Reviewed-by: Wei Xu <[email protected]> Reviewed-by: Oscar Salvador <[email protected]> Reviewed-by: Zi Yan <[email protected]> Cc: Michal Hocko <[email protected]> Cc: David Rientjes <[email protected]> Cc: Dan Williams <[email protected]> Cc: David Hildenbrand <[email protected]> Cc: Greg Thelen <[email protected]> Cc: Keith Busch <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 5ac9588 commit 26aa2d1

File tree

3 files changed

+96
-1
lines changed

3 files changed

+96
-1
lines changed

include/linux/migrate.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ enum migrate_reason {
2828
MR_NUMA_MISPLACED,
2929
MR_CONTIG_RANGE,
3030
MR_LONGTERM_PIN,
31+
MR_DEMOTION,
3132
MR_TYPES
3233
};
3334

@@ -167,6 +168,14 @@ struct migrate_vma {
167168
int migrate_vma_setup(struct migrate_vma *args);
168169
void migrate_vma_pages(struct migrate_vma *migrate);
169170
void migrate_vma_finalize(struct migrate_vma *migrate);
171+
int next_demotion_node(int node);
172+
173+
#else /* CONFIG_MIGRATION disabled: */
174+
175+
static inline int next_demotion_node(int node)
176+
{
177+
return NUMA_NO_NODE;
178+
}
170179

171180
#endif /* CONFIG_MIGRATION */
172181

include/trace/events/migrate.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121
EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \
2222
EM( MR_NUMA_MISPLACED, "numa_misplaced") \
2323
EM( MR_CONTIG_RANGE, "contig_range") \
24-
EMe(MR_LONGTERM_PIN, "longterm_pin")
24+
EM( MR_LONGTERM_PIN, "longterm_pin") \
25+
EMe(MR_DEMOTION, "demotion")
2526

2627
/*
2728
* First define the enums in the above macros to be exported to userspace

mm/vmscan.c

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
#include <linux/kthread.h>
4242
#include <linux/freezer.h>
4343
#include <linux/memcontrol.h>
44+
#include <linux/migrate.h>
4445
#include <linux/delayacct.h>
4546
#include <linux/sysctl.h>
4647
#include <linux/oom.h>
@@ -121,6 +122,9 @@ struct scan_control {
121122
/* The file pages on the current node are dangerously low */
122123
unsigned int file_is_tiny:1;
123124

125+
/* Always discard instead of demoting to lower tier memory */
126+
unsigned int no_demotion:1;
127+
124128
/* Allocation order */
125129
s8 order;
126130

@@ -518,6 +522,17 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker,
518522
return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
519523
}
520524

525+
static bool can_demote(int nid, struct scan_control *sc)
526+
{
527+
if (sc->no_demotion)
528+
return false;
529+
if (next_demotion_node(nid) == NUMA_NO_NODE)
530+
return false;
531+
532+
// FIXME: actually enable this later in the series
533+
return false;
534+
}
535+
521536
/*
522537
* This misses isolated pages which are not accounted for to save counters.
523538
* As the data only determines if reclaim or compaction continues, it is
@@ -1263,6 +1278,49 @@ static void page_check_dirty_writeback(struct page *page,
12631278
mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
12641279
}
12651280

1281+
static struct page *alloc_demote_page(struct page *page, unsigned long node)
1282+
{
1283+
struct migration_target_control mtc = {
1284+
/*
1285+
* Allocate from 'node', or fail quickly and quietly.
1286+
* When this happens, 'page' will likely just be discarded
1287+
* instead of migrated.
1288+
*/
1289+
.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
1290+
__GFP_THISNODE | __GFP_NOWARN |
1291+
__GFP_NOMEMALLOC | GFP_NOWAIT,
1292+
.nid = node
1293+
};
1294+
1295+
return alloc_migration_target(page, (unsigned long)&mtc);
1296+
}
1297+
1298+
/*
1299+
* Take pages on @demote_list and attempt to demote them to
1300+
* another node. Pages which are not demoted are left on
1301+
* @demote_pages.
1302+
*/
1303+
static unsigned int demote_page_list(struct list_head *demote_pages,
1304+
struct pglist_data *pgdat)
1305+
{
1306+
int target_nid = next_demotion_node(pgdat->node_id);
1307+
unsigned int nr_succeeded;
1308+
int err;
1309+
1310+
if (list_empty(demote_pages))
1311+
return 0;
1312+
1313+
if (target_nid == NUMA_NO_NODE)
1314+
return 0;
1315+
1316+
/* Demotion ignores all cpuset and mempolicy settings */
1317+
err = migrate_pages(demote_pages, alloc_demote_page, NULL,
1318+
target_nid, MIGRATE_ASYNC, MR_DEMOTION,
1319+
&nr_succeeded);
1320+
1321+
return nr_succeeded;
1322+
}
1323+
12661324
/*
12671325
* shrink_page_list() returns the number of reclaimed pages
12681326
*/
@@ -1274,12 +1332,16 @@ static unsigned int shrink_page_list(struct list_head *page_list,
12741332
{
12751333
LIST_HEAD(ret_pages);
12761334
LIST_HEAD(free_pages);
1335+
LIST_HEAD(demote_pages);
12771336
unsigned int nr_reclaimed = 0;
12781337
unsigned int pgactivate = 0;
1338+
bool do_demote_pass;
12791339

12801340
memset(stat, 0, sizeof(*stat));
12811341
cond_resched();
1342+
do_demote_pass = can_demote(pgdat->node_id, sc);
12821343

1344+
retry:
12831345
while (!list_empty(page_list)) {
12841346
struct address_space *mapping;
12851347
struct page *page;
@@ -1428,6 +1490,17 @@ static unsigned int shrink_page_list(struct list_head *page_list,
14281490
; /* try to reclaim the page below */
14291491
}
14301492

1493+
/*
1494+
* Before reclaiming the page, try to relocate
1495+
* its contents to another node.
1496+
*/
1497+
if (do_demote_pass &&
1498+
(thp_migration_supported() || !PageTransHuge(page))) {
1499+
list_add(&page->lru, &demote_pages);
1500+
unlock_page(page);
1501+
continue;
1502+
}
1503+
14311504
/*
14321505
* Anonymous process memory has backing store?
14331506
* Try to allocate it some swap space here.
@@ -1679,6 +1752,17 @@ static unsigned int shrink_page_list(struct list_head *page_list,
16791752
list_add(&page->lru, &ret_pages);
16801753
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
16811754
}
1755+
/* 'page_list' is always empty here */
1756+
1757+
/* Migrate pages selected for demotion */
1758+
nr_reclaimed += demote_page_list(&demote_pages, pgdat);
1759+
/* Pages that could not be demoted are still in @demote_pages */
1760+
if (!list_empty(&demote_pages)) {
1761+
/* Pages which failed to be demoted go back on @page_list for retry: */
1762+
list_splice_init(&demote_pages, page_list);
1763+
do_demote_pass = false;
1764+
goto retry;
1765+
}
16821766

16831767
pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
16841768

@@ -2326,6 +2410,7 @@ unsigned long reclaim_pages(struct list_head *page_list)
23262410
.may_writepage = 1,
23272411
.may_unmap = 1,
23282412
.may_swap = 1,
2413+
.no_demotion = 1,
23292414
};
23302415

23312416
noreclaim_flag = memalloc_noreclaim_save();

0 commit comments

Comments
 (0)