Commit a65b0e7

lelloman authored and akpm00 committed
zswap: make shrinking memcg-aware
Currently, we only have a single global LRU for zswap.  This makes it
impossible to perform workload-specific shrinking - a memcg cannot
determine which pages in the pool it owns, and often ends up writing
pages from other memcgs.  This issue has been previously observed in
practice and mitigated by simply disabling memcg-initiated shrinking:

https://lore.kernel.org/all/[email protected]/T/#u

This patch fully resolves the issue by replacing the global zswap LRU
with memcg- and NUMA-specific LRUs, and modifying the reclaim logic:

a) When a store attempt hits a memcg limit, it now triggers a
   synchronous reclaim attempt that, if successful, allows the new
   hotter page to be accepted by zswap.

b) If the store attempt instead hits the global zswap limit, it will
   trigger an asynchronous reclaim attempt, in which a memcg is
   selected for reclaim in a round-robin-like fashion.

[[email protected]: use the correct function for the onlineness check, use mem_cgroup_iter_break()]
  Link: https://lkml.kernel.org/r/[email protected]
[[email protected]: drop the pool's reference at the end of the writeback step]
  Link: https://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Domenico Cerasuolo <[email protected]>
Co-developed-by: Nhat Pham <[email protected]>
Signed-off-by: Nhat Pham <[email protected]>
Tested-by: Bagas Sanjaya <[email protected]>
Cc: Chris Li <[email protected]>
Cc: Dan Streetman <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Muchun Song <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: Seth Jennings <[email protected]>
Cc: Shakeel Butt <[email protected]>
Cc: Shuah Khan <[email protected]>
Cc: Vitaly Wool <[email protected]>
Cc: Yosry Ahmed <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
1 parent fdc4161 commit a65b0e7
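The mm/zswap.c hunk that implements (a) and (b) is not among the diffs shown on this page. As a rough, hedged sketch of the store-path control flow described above (the helper names zswap_memcg_over_limit(), zswap_writeback_one() and zswap_over_global_limit() are illustrative assumptions, not the patch's actual code):

	/*
	 * Illustrative sketch only -- not the actual mm/zswap.c change.
	 * Helpers marked "hypothetical" are assumptions for this sketch.
	 */
	static bool zswap_store_try_reclaim(struct zswap_pool *pool,
					    struct mem_cgroup *memcg)
	{
		/*
		 * (a) memcg limit hit: reclaim synchronously from this memcg's
		 *     LRU so the hotter incoming page can still be accepted.
		 */
		if (memcg && zswap_memcg_over_limit(memcg))		/* hypothetical */
			return zswap_writeback_one(pool, memcg);	/* hypothetical */

		/*
		 * (b) global zswap limit hit: kick an async worker that picks a
		 *     victim memcg in a round-robin-like fashion.
		 */
		if (zswap_over_global_limit()) {			/* hypothetical */
			queue_work(shrink_wq, &pool->shrink_work);
			return false;
		}

		return true;
	}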

File tree

6 files changed: 245 additions, 59 deletions


include/linux/memcontrol.h

Lines changed: 5 additions & 0 deletions
@@ -1192,6 +1192,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
 	return NULL;
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
+{
+	return NULL;
+}
+
 static inline bool folio_memcg_kmem(struct folio *folio)
 {
 	return false;
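Only the !CONFIG_MEMCG stub is added here; with CONFIG_MEMCG enabled, get_mem_cgroup_from_objcg() resolves an obj_cgroup to its memcg with a reference held, which is what lets zswap find the right per-memcg LRU for an entry. A hedged usage sketch (the zswap-side names entry->objcg and zswap_lru_add() are assumptions, not shown in these hunks):

	struct mem_cgroup *memcg;

	/* Resolve the entry's objcg to its memcg (reference held)... */
	memcg = get_mem_cgroup_from_objcg(entry->objcg);
	/* ...add the entry to that memcg's LRU (hypothetical helper)... */
	zswap_lru_add(&pool->list_lru, entry, memcg);
	/* ...and drop the reference once the entry is on the list. */
	mem_cgroup_put(memcg);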

include/linux/zswap.h

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,7 @@ bool zswap_load(struct folio *folio);
 void zswap_invalidate(int type, pgoff_t offset);
 void zswap_swapon(int type);
 void zswap_swapoff(int type);
+void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
 
 #else
 
@@ -31,6 +32,7 @@ static inline bool zswap_load(struct folio *folio)
 static inline void zswap_invalidate(int type, pgoff_t offset) {}
 static inline void zswap_swapon(int type) {}
 static inline void zswap_swapoff(int type) {}
+static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
 
 #endif

mm/memcontrol.c

Lines changed: 2 additions & 0 deletions
@@ -5614,6 +5614,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
 
+	zswap_memcg_offline_cleanup(memcg);
+
 	memcg_offline_kmem(memcg);
 	reparent_shrinker_deferred(memcg);
 	wb_memcg_offline(memcg);
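The zswap side of this hook lives in mm/zswap.c and is not shown on this page; its job is to make sure zswap stops holding on to the memcg that is going offline. A hedged sketch, assuming each pool keeps a next_shrink cursor for the round-robin reclaim walk (field and lock names are illustrative):

	void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
	{
		struct zswap_pool *pool;

		/* Clear any round-robin cursor pointing at the dying memcg. */
		spin_lock(&zswap_pools_lock);
		list_for_each_entry(pool, &zswap_pools, list) {
			if (pool->next_shrink == memcg) {
				/*
				 * Per the fixup note in the commit message, break the
				 * iterator so the reference taken by mem_cgroup_iter()
				 * is dropped.
				 */
				mem_cgroup_iter_break(NULL, pool->next_shrink);
				pool->next_shrink = NULL;
			}
		}
		spin_unlock(&zswap_pools_lock);
	}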

mm/swap.h

Lines changed: 2 additions & 1 deletion
@@ -51,7 +51,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				   struct swap_iocb **plug);
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				     struct mempolicy *mpol, pgoff_t ilx,
-				     bool *new_page_allocated);
+				     bool *new_page_allocated,
+				     bool skip_if_exists);
 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
 				    struct mempolicy *mpol, pgoff_t ilx);
 struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,

mm/swap_state.c

Lines changed: 18 additions & 6 deletions
@@ -412,7 +412,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
 
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				     struct mempolicy *mpol, pgoff_t ilx,
-				     bool *new_page_allocated)
+				     bool *new_page_allocated,
+				     bool skip_if_exists)
 {
 	struct swap_info_struct *si;
 	struct folio *folio;
@@ -470,6 +471,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		if (err != -EEXIST)
 			goto fail_put_swap;
 
+		/*
+		 * Protect against a recursive call to __read_swap_cache_async()
+		 * on the same entry waiting forever here because SWAP_HAS_CACHE
+		 * is set but the folio is not the swap cache yet. This can
+		 * happen today if mem_cgroup_swapin_charge_folio() below
+		 * triggers reclaim through zswap, which may call
+		 * __read_swap_cache_async() in the writeback path.
+		 */
+		if (skip_if_exists)
+			goto fail_put_swap;
+
 		/*
 		 * We might race against __delete_from_swap_cache(), and
 		 * stumble across a swap_map entry whose SWAP_HAS_CACHE
@@ -537,7 +549,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 
 	mpol = get_vma_policy(vma, addr, 0, &ilx);
 	page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
-					&page_allocated);
+					&page_allocated, false);
 	mpol_cond_put(mpol);
 
 	if (page_allocated)
@@ -654,7 +666,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 		/* Ok, do the async read-ahead now */
 		page = __read_swap_cache_async(
 				swp_entry(swp_type(entry), offset),
-				gfp_mask, mpol, ilx, &page_allocated);
+				gfp_mask, mpol, ilx, &page_allocated, false);
 		if (!page)
 			continue;
 		if (page_allocated) {
@@ -672,7 +684,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 skip:
 	/* The page was likely read above, so no need for plugging here */
 	page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
-					&page_allocated);
+					&page_allocated, false);
 	if (unlikely(page_allocated))
 		swap_readpage(page, false, NULL);
 	return page;
@@ -827,7 +839,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 		pte_unmap(pte);
 		pte = NULL;
 		page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
-						&page_allocated);
+						&page_allocated, false);
 		if (!page)
 			continue;
 		if (page_allocated) {
@@ -847,7 +859,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 skip:
 	/* The page was likely read above, so no need for plugging here */
 	page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
-					&page_allocated);
+					&page_allocated, false);
 	if (unlikely(page_allocated))
 		swap_readpage(page, false, NULL);
 	return page;
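All of the existing callers above pass skip_if_exists=false, so their behaviour is unchanged; the one caller that passes true is the zswap writeback path mentioned in the new comment, in mm/zswap.c, which is not shown on this page. A hedged sketch of that call site, with the surrounding variable names (swpentry, mpol, ilx, page_was_allocated) as assumptions:

	/*
	 * zswap writeback (sketch): look up or allocate the swapcache page for
	 * the entry being written back, but bail out instead of retrying
	 * forever if another context already holds SWAP_HAS_CACHE for it.
	 */
	page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, ilx,
				       &page_was_allocated, true);
	if (!page)
		return -ENOMEM;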
