@@ -2394,8 +2394,28 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
 }
 
 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
-			 unsigned int flags, struct page **pages, int *nr)
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do below in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already).  To avoid race in such condition, we need to
+ * also check pmd here to make sure pmd doesn't change (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+			 unsigned long end, unsigned int flags,
+			 struct page **pages, int *nr)
 {
 	struct dev_pagemap *pgmap = NULL;
 	int nr_start = *nr, ret = 0;
@@ -2441,7 +2461,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 			goto pte_unmap;
 		}
 
-		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+		if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+		    unlikely(pte_val(pte) != pte_val(*ptep))) {
 			gup_put_folio(folio, 1, flags);
 			goto pte_unmap;
 		}
@@ -2488,8 +2509,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
  * get_user_pages_fast_only implementation that can pin pages.  Thus it's still
  * useful to have gup_huge_pmd even if we can't operate on ptes.
  */
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
-			 unsigned int flags, struct page **pages, int *nr)
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+			 unsigned long end, unsigned int flags,
+			 struct page **pages, int *nr)
 {
 	return 0;
 }
@@ -2813,7 +2835,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
 		if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
 				 PMD_SHIFT, next, flags, pages, nr))
 			return 0;
-		} else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
+		} else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
 			return 0;
 	} while (pmdp++, addr = next, addr != end);
0 commit comments