
Commit 70cbc3c

yang-shi authored and akpm00 committed
mm: gup: fix the fast GUP race against THP collapse
Since general RCU GUP fast was introduced in commit 2667f50 ("mm: introduce a
general RCU get_user_pages_fast()"), a TLB flush is no longer sufficient to
handle concurrent GUP-fast in all cases; it only handles traditional IPI-based
GUP-fast correctly.  On architectures that send an IPI broadcast on TLB flush,
it works as expected.  But on the architectures that do not use IPI to
broadcast TLB flush, it may have the below race:

       CPU A                                  CPU B
  THP collapse                                fast GUP
                                              gup_pmd_range() <-- see valid pmd
                                                gup_pte_range() <-- work on pte
  pmdp_collapse_flush() <-- clear pmd and flush
  __collapse_huge_page_isolate()
      check page pinned <-- before GUP bump refcount
                                                pin the page
                                                check PTE <-- no change
  __collapse_huge_page_copy()
      copy data to huge page
      ptep_clear()
  install huge pmd for the huge page
                                                return the stale page
  discard the stale page

The race can be fixed by checking whether the PMD is changed or not after
taking the page pin in fast GUP, just like what it does for the PTE.  If the
PMD is changed, it means there may be a parallel THP collapse, so GUP should
back off.

Also update the stale comment about serializing against fast GUP in
khugepaged.

Link: https://lkml.kernel.org/r/[email protected]
Fixes: 2667f50 ("mm: introduce a general RCU get_user_pages_fast()")
Acked-by: David Hildenbrand <[email protected]>
Acked-by: Peter Xu <[email protected]>
Signed-off-by: Yang Shi <[email protected]>
Reviewed-by: John Hubbard <[email protected]>
Cc: "Aneesh Kumar K.V" <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Jason Gunthorpe <[email protected]>
Cc: "Kirill A. Shutemov" <[email protected]>
Cc: Michael Ellerman <[email protected]>
Cc: Nicholas Piggin <[email protected]>
Cc: Christophe Leroy <[email protected]>
Cc: <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
1 parent 4eb5bbd commit 70cbc3c
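As background for the diff below, here is a minimal, userspace-style sketch of
the pinning-side ordering the fix relies on.  It is an illustration, not the
kernel code: fast_gup_try_pin and page_stub are made-up names, and C11 atomics
stand in for the kernel's page-table accessors and folio refcount helpers (the
real logic lives in gup_pte_range() in mm/gup.c).

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct page_stub { atomic_int refcount; };

/*
 * Illustrative sketch only -- not the kernel implementation.  'pmd' is the
 * value the caller sampled when it walked the pmd level (the real code gets
 * it as the pmd_t argument of gup_pte_range()).
 */
static bool fast_gup_try_pin(uint64_t pmd, _Atomic uint64_t *pmdp,
			     _Atomic uint64_t *ptep, struct page_stub *page)
{
	uint64_t pte = atomic_load(ptep);	/* snapshot the pte */

	atomic_fetch_add(&page->refcount, 1);	/* (1) pin the page */

	/*
	 * (2) Recheck.  Before the fix only the pte was rechecked, so a THP
	 * collapse that had already cleared the pmd and inspected the
	 * refcount before our pin went unnoticed; rechecking the pmd as
	 * well makes fast GUP back off in that window.
	 */
	if (atomic_load(pmdp) != pmd || atomic_load(ptep) != pte) {
		atomic_fetch_sub(&page->refcount, 1);	/* back off */
		return false;
	}
	return true;
}

On architectures where the TLB flush is broadcast by IPI, traditional GUP-fast
is already serialized against the collapse because it runs with interrupts
disabled; the pmd recheck provides the equivalent back-off on architectures
that flush without IPIs.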

2 files changed: +34 -10 lines changed

mm/gup.c

Lines changed: 28 additions & 6 deletions
@@ -2345,8 +2345,28 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
 }
 
 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
-			 unsigned int flags, struct page **pages, int *nr)
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do below in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already).  To avoid race in such condition, we need to
+ * also check pmd here to make sure pmd doesn't change (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+			 unsigned long end, unsigned int flags,
+			 struct page **pages, int *nr)
 {
 	struct dev_pagemap *pgmap = NULL;
 	int nr_start = *nr, ret = 0;
@@ -2392,7 +2412,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 			goto pte_unmap;
 		}
 
-		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+		if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+		    unlikely(pte_val(pte) != pte_val(*ptep))) {
 			gup_put_folio(folio, 1, flags);
 			goto pte_unmap;
 		}
@@ -2439,8 +2460,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
  * get_user_pages_fast_only implementation that can pin pages.  Thus it's still
  * useful to have gup_huge_pmd even if we can't operate on ptes.
  */
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
-			 unsigned int flags, struct page **pages, int *nr)
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+			 unsigned long end, unsigned int flags,
+			 struct page **pages, int *nr)
 {
 	return 0;
 }
@@ -2764,7 +2786,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
 			if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
 					 PMD_SHIFT, next, flags, pages, nr))
 				return 0;
-		} else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
+		} else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
 			return 0;
 	} while (pmdp++, addr = next, addr != end);
 
mm/khugepaged.c

Lines changed: 6 additions & 4 deletions
@@ -1083,10 +1083,12 @@ static void collapse_huge_page(struct mm_struct *mm,
 
 	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
 	/*
-	 * After this gup_fast can't run anymore. This also removes
-	 * any huge TLB entry from the CPU so we won't allow
-	 * huge and small TLB entries for the same virtual address
-	 * to avoid the risk of CPU bugs in that area.
+	 * This removes any huge TLB entry from the CPU so we won't allow
+	 * huge and small TLB entries for the same virtual address to
+	 * avoid the risk of CPU bugs in that area.
+	 *
+	 * Parallel fast GUP is fine since fast GUP will back off when
+	 * it detects PMD is changed.
	 */
 	_pmd = pmdp_collapse_flush(vma, address, pmd);
 	spin_unlock(pmd_ptl);
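For symmetry with the sketch shown after the commit message, the collapse side
of the same protocol can be pictured as below.  Again this is a hypothetical,
userspace-style illustration (collapse_can_proceed and page_stub are invented
names, not the kernel API): the pmd is cleared before the pin check, so a
fast-GUP pin taken after the clear is caught by fast GUP's own pmd recheck,
while a pin taken before it shows up here as an elevated refcount.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct page_stub { atomic_int refcount; };

/* Hypothetical sketch of the ordering used by collapse_huge_page(). */
static bool collapse_can_proceed(_Atomic uint64_t *pmdp,
				 struct page_stub *page,
				 int refs_from_mappings)
{
	/* (1) analogue of pmdp_collapse_flush(): clear the pmd (and flush) */
	atomic_store(pmdp, 0);

	/* (2) analogue of the pin check in __collapse_huge_page_isolate() */
	if (atomic_load(&page->refcount) > refs_from_mappings)
		return false;	/* an extra pin exists: abort the collapse */

	/* safe to copy the data and install the huge pmd */
	return true;
}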
