
Commit 6b24ca4

Author: Matthew Wilcox (Oracle)
mm: Use multi-index entries in the page cache
We currently store large folios as 2^N consecutive entries.  While this
consumes rather more memory than necessary, it also turns out to be buggy.
A writeback operation which starts within a tail page of a dirty folio will
not write back the folio as the xarray's dirty bit is only set on the head
index.  With multi-index entries, the dirty bit will be found no matter
where in the folio the operation starts.

This does end up simplifying the page cache slightly, although not as much
as I had hoped.

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Reviewed-by: William Kucharski <[email protected]>
1 parent 25a8de7 commit 6b24ca4
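
For context, a minimal sketch (not part of this commit) of what a multi-index entry provides, using the in-kernel xarray API (XA_STATE_ORDER, xas_store, xas_nomem, xa_load); the helper name and calling convention below are illustrative only:

/*
 * Illustrative sketch only: store one object at @index covering 2^@order
 * slots as a single multi-index entry.  A later walk or mark check that
 * starts at any index inside the range lands on the same entry, which is
 * how a dirty mark set on the folio is found even when the walk starts
 * at a tail index.
 */
static void *store_multi_index(struct xarray *xa, void *folio,
                               unsigned long index, unsigned int order)
{
        XA_STATE_ORDER(xas, xa, index, order);

        do {
                xas_lock_irq(&xas);
                xas_store(&xas, folio);         /* one entry, 2^order indices */
                xas_unlock_irq(&xas);
        } while (xas_nomem(&xas, GFP_KERNEL));

        /* Any index inside the range returns the same entry. */
        return xa_load(xa, index + (1UL << order) - 1);
}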

6 files changed, 72 insertions(+), 56 deletions(-)


include/linux/pagemap.h: -10

@@ -1125,16 +1125,6 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac,
 		VM_BUG_ON_PAGE(PageTail(page), page);
 		array[i++] = page;
 		rac->_batch_count += thp_nr_pages(page);
-
-		/*
-		 * The page cache isn't using multi-index entries yet,
-		 * so the xas cursor needs to be manually moved to the
-		 * next index.  This can be removed once the page cache
-		 * is converted.
-		 */
-		if (PageHead(page))
-			xas_set(&xas, rac->_index + rac->_batch_count);
-
 		if (i == array_sz)
 			break;
 	}
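
The manual cursor advance removed above is no longer needed because an iteration now sees a large folio exactly once: a multi-index entry is reported a single time for its whole range and the cursor continues after it. An illustrative sketch of that behaviour (the helper below is hypothetical, not kernel code):

/*
 * Hypothetical helper, for illustration only: count distinct entries in
 * [first, last].  With multi-index entries, a folio spanning 2^N slots is
 * returned once by xas_for_each(), so no manual xas_set() is needed to
 * skip over its tail indices.
 */
static unsigned int count_cache_entries(struct xarray *xa,
                                        unsigned long first, unsigned long last)
{
        XA_STATE(xas, xa, first);
        void *entry;
        unsigned int seen = 0;

        rcu_read_lock();
        xas_for_each(&xas, entry, last) {
                if (xas_retry(&xas, entry))
                        continue;
                seen++;         /* one hit per folio, however large */
        }
        rcu_read_unlock();

        return seen;
}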

mm/filemap.c: +39 -22

@@ -135,7 +135,6 @@ static void page_cache_delete(struct address_space *mapping,
 	}
 
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
-	VM_BUG_ON_FOLIO(nr != 1 && shadow, folio);
 
 	xas_store(&xas, shadow);
 	xas_init_marks(&xas);
@@ -286,7 +285,7 @@ static void page_cache_delete_batch(struct address_space *mapping,
 		struct folio_batch *fbatch)
 {
 	XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
-	int total_pages = 0;
+	long total_pages = 0;
 	int i = 0;
 	struct folio *folio;
 
@@ -313,18 +312,12 @@ static void page_cache_delete_batch(struct address_space *mapping,
 
 		WARN_ON_ONCE(!folio_test_locked(folio));
 
-		if (folio->index == xas.xa_index)
-			folio->mapping = NULL;
+		folio->mapping = NULL;
 		/* Leave folio->index set: truncation lookup relies on it */
 
-		/*
-		 * Move to the next folio in the batch if this is a regular
-		 * folio or the index is of the last sub-page of this folio.
-		 */
-		if (folio->index + folio_nr_pages(folio) - 1 == xas.xa_index)
-			i++;
+		i++;
 		xas_store(&xas, NULL);
-		total_pages++;
+		total_pages += folio_nr_pages(folio);
 	}
 	mapping->nrpages -= total_pages;
 }
@@ -2089,24 +2082,27 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
 		indices[fbatch->nr] = xas.xa_index;
 		if (!folio_batch_add(fbatch, folio))
 			break;
-		goto next;
+		continue;
 unlock:
 		folio_unlock(folio);
 put:
 		folio_put(folio);
-next:
-		if (!xa_is_value(folio) && folio_test_large(folio)) {
-			xas_set(&xas, folio->index + folio_nr_pages(folio));
-			/* Did we wrap on 32-bit? */
-			if (!xas.xa_index)
-				break;
-		}
 	}
 	rcu_read_unlock();
 
 	return folio_batch_count(fbatch);
 }
 
+static inline
+bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max)
+{
+	if (!folio_test_large(folio) || folio_test_hugetlb(folio))
+		return false;
+	if (index >= max)
+		return false;
+	return index < folio->index + folio_nr_pages(folio) - 1;
+}
+
 /**
  * find_get_pages_range - gang pagecache lookup
  * @mapping:	The address_space to search
@@ -2145,11 +2141,17 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
 		if (xa_is_value(folio))
 			continue;
 
+again:
 		pages[ret] = folio_file_page(folio, xas.xa_index);
 		if (++ret == nr_pages) {
 			*start = xas.xa_index + 1;
 			goto out;
 		}
+		if (folio_more_pages(folio, xas.xa_index, end)) {
+			xas.xa_index++;
+			folio_ref_inc(folio);
+			goto again;
+		}
 	}
 
 	/*
@@ -2207,9 +2209,15 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 		if (unlikely(folio != xas_reload(&xas)))
 			goto put_page;
 
-		pages[ret] = &folio->page;
+again:
+		pages[ret] = folio_file_page(folio, xas.xa_index);
 		if (++ret == nr_pages)
 			break;
+		if (folio_more_pages(folio, xas.xa_index, ULONG_MAX)) {
+			xas.xa_index++;
+			folio_ref_inc(folio);
+			goto again;
+		}
 		continue;
put_page:
 		folio_put(folio);
@@ -2334,8 +2342,7 @@ static void filemap_get_read_batch(struct address_space *mapping,
 			break;
 		if (folio_test_readahead(folio))
 			break;
-		xas.xa_index = folio->index + folio_nr_pages(folio) - 1;
-		xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK;
+		xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1);
 		continue;
put_folio:
 		folio_put(folio);
@@ -3284,6 +3291,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 	addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
 	do {
+again:
 		page = folio_file_page(folio, xas.xa_index);
 		if (PageHWPoison(page))
 			goto unlock;
@@ -3305,9 +3313,18 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		do_set_pte(vmf, page, addr);
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, addr, vmf->pte);
+		if (folio_more_pages(folio, xas.xa_index, end_pgoff)) {
+			xas.xa_index++;
+			folio_ref_inc(folio);
+			goto again;
+		}
 		folio_unlock(folio);
 		continue;
unlock:
+		if (folio_more_pages(folio, xas.xa_index, end_pgoff)) {
+			xas.xa_index++;
+			goto again;
+		}
 		folio_unlock(folio);
 		folio_put(folio);
 	} while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL);
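
The callers above share one pattern: folio_more_pages() says whether the current folio still has sub-pages before the end of the range, and each extra sub-page handed out takes its own folio reference so it can be put independently. An illustrative sketch of that pattern as a hypothetical helper in the same file (names and parameters are not from the commit):

/*
 * Illustrative sketch only: hand back each sub-page of a large folio as
 * its own page pointer, taking one extra folio reference per additional
 * page returned.
 */
static unsigned int fill_from_folio(struct folio *folio, struct xa_state *xas,
                                    pgoff_t end, struct page **pages,
                                    unsigned int ret, unsigned int nr_pages)
{
again:
        pages[ret] = folio_file_page(folio, xas->xa_index);
        if (++ret == nr_pages)
                return ret;
        if (folio_more_pages(folio, xas->xa_index, end)) {
                xas->xa_index++;
                folio_ref_inc(folio);   /* one reference per returned page */
                goto again;
        }
        return ret;
}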

mm/huge_memory.c: +14 -4

@@ -2614,6 +2614,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 {
 	struct page *head = compound_head(page);
 	struct deferred_split *ds_queue = get_deferred_split_queue(head);
+	XA_STATE(xas, &head->mapping->i_pages, head->index);
 	struct anon_vma *anon_vma = NULL;
 	struct address_space *mapping = NULL;
 	int extra_pins, ret;
@@ -2652,6 +2653,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 			goto out;
 		}
 
+		xas_split_alloc(&xas, head, compound_order(head),
+				mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK);
+		if (xas_error(&xas)) {
+			ret = xas_error(&xas);
+			goto out;
+		}
+
 		anon_vma = NULL;
 		i_mmap_lock_read(mapping);
 
@@ -2681,13 +2689,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	/* block interrupt reentry in xa_lock and spinlock */
 	local_irq_disable();
 	if (mapping) {
-		XA_STATE(xas, &mapping->i_pages, page_index(head));
-
 		/*
 		 * Check if the head page is present in page cache.
 		 * We assume all tail are present too, if head is there.
 		 */
-		xa_lock(&mapping->i_pages);
+		xas_lock(&xas);
+		xas_reset(&xas);
 		if (xas_load(&xas) != head)
 			goto fail;
 	}
@@ -2703,6 +2710,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		if (mapping) {
 			int nr = thp_nr_pages(head);
 
+			xas_split(&xas, head, thp_order(head));
 			if (PageSwapBacked(head)) {
 				__mod_lruvec_page_state(head, NR_SHMEM_THPS,
 							-nr);
@@ -2719,7 +2727,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		spin_unlock(&ds_queue->split_queue_lock);
fail:
 		if (mapping)
-			xa_unlock(&mapping->i_pages);
+			xas_unlock(&xas);
 		local_irq_enable();
 		remap_page(head, thp_nr_pages(head));
 		ret = -EBUSY;
@@ -2733,6 +2741,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	if (mapping)
 		i_mmap_unlock_read(mapping);
out:
+	/* Free any memory we didn't use */
+	xas_nomem(&xas, 0);
 	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
 	return ret;
 }
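
The split side follows a prepare/commit shape: the xarray nodes needed to break the multi-index entry back into order-0 entries are allocated up front with xas_split_alloc() (a sleeping allocation outside the lock), the actual xas_split() runs under the lock, and any unused preallocation is released at the end with xas_nomem(&xas, 0). A hedged sketch of that shape, with a hypothetical wrapper name and simplified locking:

/*
 * Hypothetical sketch only: split a multi-index entry for @head of
 * @old_order back into individual entries.
 */
static int split_entry(struct xarray *xa, struct page *head,
                       unsigned int old_order, gfp_t gfp)
{
        XA_STATE(xas, xa, head->index);
        int ret = 0;

        xas_split_alloc(&xas, head, old_order, gfp); /* may sleep */
        if (xas_error(&xas))
                return xas_error(&xas);

        xas_lock_irq(&xas);
        xas_reset(&xas);
        if (xas_load(&xas) == head)
                xas_split(&xas, head, old_order);    /* no allocation here */
        else
                ret = -EBUSY;
        xas_unlock_irq(&xas);

        xas_nomem(&xas, 0);     /* free any pre-allocated nodes we didn't use */
        return ret;
}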

mm/khugepaged.c

+11-1
Original file line numberDiff line numberDiff line change
@@ -1667,7 +1667,10 @@ static void collapse_file(struct mm_struct *mm,
16671667
}
16681668
count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
16691669

1670-
/* This will be less messy when we use multi-index entries */
1670+
/*
1671+
* Ensure we have slots for all the pages in the range. This is
1672+
* almost certainly a no-op because most of the pages must be present
1673+
*/
16711674
do {
16721675
xas_lock_irq(&xas);
16731676
xas_create_range(&xas);
@@ -1892,6 +1895,9 @@ static void collapse_file(struct mm_struct *mm,
18921895
__mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
18931896
}
18941897

1898+
/* Join all the small entries into a single multi-index entry */
1899+
xas_set_order(&xas, start, HPAGE_PMD_ORDER);
1900+
xas_store(&xas, new_page);
18951901
xa_locked:
18961902
xas_unlock_irq(&xas);
18971903
xa_unlocked:
@@ -2013,6 +2019,10 @@ static void khugepaged_scan_file(struct mm_struct *mm,
20132019
continue;
20142020
}
20152021

2022+
/*
2023+
* XXX: khugepaged should compact smaller compound pages
2024+
* into a PMD sized page
2025+
*/
20162026
if (PageTransCompound(page)) {
20172027
result = SCAN_PAGE_COMPOUND;
20182028
break;
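
collapse_file() still creates the individual slots up front, but once the range is fully populated it is rewritten as a single entry. A minimal, hypothetical sketch of that join step (the real function's surrounding locking, accounting, and rollback are omitted):

/*
 * Hypothetical sketch only: rewrite a PMD-sized range starting at @start
 * as one multi-index entry pointing at @new_page.  Because the range was
 * populated beforehand (as collapse_file() ensures), the store should not
 * need to allocate new nodes.
 */
static void join_small_entries(struct xarray *xa, pgoff_t start,
                               struct page *new_page)
{
        XA_STATE_ORDER(xas, xa, start, HPAGE_PMD_ORDER);

        xas_lock_irq(&xas);
        xas_store(&xas, new_page);      /* covers all HPAGE_PMD_NR indices */
        xas_unlock_irq(&xas);
}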

mm/migrate.c: -8

@@ -433,14 +433,6 @@ int folio_migrate_mapping(struct address_space *mapping,
 	}
 
 	xas_store(&xas, newfolio);
-	if (nr > 1) {
-		int i;
-
-		for (i = 1; i < nr; i++) {
-			xas_next(&xas);
-			xas_store(&xas, newfolio);
-		}
-	}
 
 	/*
	 * Drop cache reference from old page by unfreezing

mm/shmem.c: +8 -11

@@ -694,7 +694,6 @@ static int shmem_add_to_page_cache(struct page *page,
 				   struct mm_struct *charge_mm)
 {
 	XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
-	unsigned long i = 0;
 	unsigned long nr = compound_nr(page);
 	int error;
 
@@ -721,20 +720,18 @@ static int shmem_add_to_page_cache(struct page *page,
 	cgroup_throttle_swaprate(page, gfp);
 
 	do {
-		void *entry;
 		xas_lock_irq(&xas);
-		entry = xas_find_conflict(&xas);
-		if (entry != expected)
+		if (expected != xas_find_conflict(&xas)) {
+			xas_set_err(&xas, -EEXIST);
+			goto unlock;
+		}
+		if (expected && xas_find_conflict(&xas)) {
 			xas_set_err(&xas, -EEXIST);
-		xas_create_range(&xas);
-		if (xas_error(&xas))
 			goto unlock;
-next:
-		xas_store(&xas, page);
-		if (++i < nr) {
-			xas_next(&xas);
-			goto next;
 		}
+		xas_store(&xas, page);
+		if (xas_error(&xas))
+			goto unlock;
 		if (PageTransHuge(page)) {
 			count_vm_event(THP_FILE_ALLOC);
 			__mod_lruvec_page_state(page, NR_SHMEM_THPS, nr);
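
shmem now relies on a single XA_STATE_ORDER store covering the whole compound page, with xas_find_conflict() detecting anything already occupying the range. A hedged sketch of that insertion pattern, with a hypothetical function name and without shmem's swap-entry handling:

/*
 * Hypothetical sketch only: insert @new at @index spanning 2^@order slots
 * unless any slot in the range is already occupied.  xas_find_conflict()
 * returns the next non-NULL entry inside the XA_STATE_ORDER range, so a
 * single check guards the whole store.
 */
static int add_unless_conflict(struct xarray *xa, unsigned long index,
                               unsigned int order, void *new)
{
        XA_STATE_ORDER(xas, xa, index, order);

        do {
                xas_lock_irq(&xas);
                if (xas_find_conflict(&xas)) {
                        xas_set_err(&xas, -EEXIST);
                        goto unlock;
                }
                xas_store(&xas, new);
unlock:
                xas_unlock_irq(&xas);
        } while (xas_nomem(&xas, GFP_KERNEL));

        return xas_error(&xas);
}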
