Skip to content

Commit cf264e1

Browse files
nhatsmrtakpm00
authored andcommitted
cachestat: implement cachestat syscall
There is currently no good way to query the page cache state of large file sets and directory trees. There is mincore(), but it scales poorly: the kernel writes out a lot of bitmap data that userspace has to aggregate, when the user really does not care about per-page information in that case. The user also needs to mmap and unmap each file as it goes along, which can be quite slow as well. Some use cases where this information could come in handy: * Allowing a database to decide whether to perform an index scan or direct table queries based on the in-memory cache state of the index. * Visibility into the writeback algorithm, for diagnosing performance issues. * Workload-aware writeback pacing: estimating IO fulfilled by page cache (and IO to be done) within a range of a file, allowing for more frequent syncing when and where there is IO capacity, and batching when there is not. * Computing memory usage of large files/directory trees, analogous to the du tool for disk usage. More information about these use cases can be found in the following thread: https://lore.kernel.org/lkml/[email protected]/ This patch implements a new syscall that queries the cache state of a file and summarizes the number of cached pages, number of dirty pages, number of pages marked for writeback, number of (recently) evicted pages, etc. in a given range. Currently, the syscall is only wired in for the x86 architecture. NAME cachestat - query the page cache statistics of a file. 
SYNOPSIS #include <sys/mman.h> struct cachestat_range { __u64 off; __u64 len; }; struct cachestat { __u64 nr_cache; __u64 nr_dirty; __u64 nr_writeback; __u64 nr_evicted; __u64 nr_recently_evicted; }; int cachestat(unsigned int fd, struct cachestat_range *cstat_range, struct cachestat *cstat, unsigned int flags); DESCRIPTION cachestat() queries the number of cached pages, number of dirty pages, number of pages marked for writeback, number of evicted pages, number of recently evicted pages, in the bytes range given by `off` and `len`. An evicted page is a page that was previously in the page cache but has been evicted since. A page is recently evicted if its last eviction was recent enough that its reentry to the cache would indicate that it is actively being used by the system, and that there is memory pressure on the system. These values are returned in a cachestat struct, whose address is given by the `cstat` argument. The `off` and `len` arguments must be non-negative integers. If `len` > 0, the queried range is [`off`, `off` + `len`]. If `len` == 0, we will query in the range from `off` to the end of the file. The `flags` argument is unused for now, but is included for future extensibility. Users should pass 0 (i.e. no flag specified). Currently, hugetlbfs is not supported. Because the status of a page can change after cachestat() checks it but before it returns to the application, the returned values may contain stale information. RETURN VALUE On success, cachestat returns 0. On error, -1 is returned, and errno is set to indicate the error. ERRORS EFAULT cstat or cstat_range points to an invalid address. EINVAL invalid flags. EBADF invalid file descriptor. 
EOPNOTSUPP file descriptor is of a hugetlbfs file [[email protected]: replace rounddown logic with the existing helper] Link: https://lkml.kernel.org/r/[email protected] Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Nhat Pham <[email protected]> Acked-by: Johannes Weiner <[email protected]> Cc: Brian Foster <[email protected]> Cc: Matthew Wilcox (Oracle) <[email protected]> Cc: Michael Kerrisk <[email protected]> Signed-off-by: Andrew Morton <[email protected]>
1 parent ffcb5f5 commit cf264e1

File tree

8 files changed

+207
-1
lines changed

8 files changed

+207
-1
lines changed

arch/x86/entry/syscalls/syscall_32.tbl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -455,3 +455,4 @@
455455
448 i386 process_mrelease sys_process_mrelease
456456
449 i386 futex_waitv sys_futex_waitv
457457
450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
458+
451 i386 cachestat sys_cachestat

arch/x86/entry/syscalls/syscall_64.tbl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,7 @@
372372
448 common process_mrelease sys_process_mrelease
373373
449 common futex_waitv sys_futex_waitv
374374
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
375+
451 common cachestat sys_cachestat
375376

376377
#
377378
# Due to a historical design error, certain syscalls are numbered differently

include/linux/syscalls.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ struct open_how;
7272
struct mount_attr;
7373
struct landlock_ruleset_attr;
7474
enum landlock_rule_type;
75+
struct cachestat_range;
76+
struct cachestat;
7577

7678
#include <linux/types.h>
7779
#include <linux/aio_abi.h>
@@ -1058,6 +1060,9 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
10581060
asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
10591061
unsigned long home_node,
10601062
unsigned long flags);
1063+
asmlinkage long sys_cachestat(unsigned int fd,
1064+
struct cachestat_range __user *cstat_range,
1065+
struct cachestat __user *cstat, unsigned int flags);
10611066

10621067
/*
10631068
* Architecture-specific system calls

include/uapi/asm-generic/unistd.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -886,8 +886,11 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
886886
#define __NR_set_mempolicy_home_node 450
887887
__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
888888

889+
#define __NR_cachestat 451
890+
__SYSCALL(__NR_cachestat, sys_cachestat)
891+
889892
#undef __NR_syscalls
890-
#define __NR_syscalls 451
893+
#define __NR_syscalls 452
891894

892895
/*
893896
* 32 bit systems traditionally used different

include/uapi/linux/mman.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
#include <asm/mman.h>
66
#include <asm-generic/hugetlb_encode.h>
7+
#include <linux/types.h>
78

89
#define MREMAP_MAYMOVE 1
910
#define MREMAP_FIXED 2
@@ -41,4 +42,17 @@
4142
#define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB
4243
#define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB
4344

45+
/*
 * Byte range of a file to be queried by the cachestat() system call.
 */
struct cachestat_range {
46+
__u64 off;	/* starting byte offset of the queried range */
47+
__u64 len;	/* length in bytes; 0 means "from off to the end of the file" */
48+
};
49+
50+
/*
 * Page cache statistics reported by the cachestat() system call.
 * All counters are in units of pages within the queried range.
 */
struct cachestat {
51+
__u64 nr_cache;	/* pages currently present in the page cache */
52+
__u64 nr_dirty;	/* cached pages that are dirty */
53+
__u64 nr_writeback;	/* cached pages marked for writeback */
54+
__u64 nr_evicted;	/* pages that were in the page cache but have been evicted */
55+
__u64 nr_recently_evicted;	/* evicted pages whose eviction is judged recent */
56+
};
57+
4458
#endif /* _UAPI_LINUX_MMAN_H */

init/Kconfig

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1771,6 +1771,16 @@ config RSEQ
17711771

17721772
If unsure, say Y.
17731773

1774+
config CACHESTAT_SYSCALL
1775+
bool "Enable cachestat() system call" if EXPERT
1776+
default y
1777+
help
1778+
Enable the cachestat system call, which queries the page cache
1779+
statistics of a file (number of cached pages, dirty pages,
1780+
pages marked for writeback, (recently) evicted pages).
1781+
1782+
If unsure say Y here.
1783+
17741784
config DEBUG_RSEQ
17751785
default n
17761786
bool "Enabled debugging of rseq() system call" if EXPERT

kernel/sys_ni.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,7 @@ COND_SYSCALL(set_mempolicy);
299299
COND_SYSCALL(migrate_pages);
300300
COND_SYSCALL(move_pages);
301301
COND_SYSCALL(set_mempolicy_home_node);
302+
COND_SYSCALL(cachestat);
302303

303304
COND_SYSCALL(perf_event_open);
304305
COND_SYSCALL(accept4);

mm/filemap.c

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <linux/mm.h>
2323
#include <linux/swap.h>
2424
#include <linux/swapops.h>
25+
#include <linux/syscalls.h>
2526
#include <linux/mman.h>
2627
#include <linux/pagemap.h>
2728
#include <linux/file.h>
@@ -58,6 +59,8 @@
5859

5960
#include <asm/mman.h>
6061

62+
#include "swap.h"
63+
6164
/*
6265
* Shared mappings implemented 30.11.1994. It's not fully working yet,
6366
* though.
@@ -4119,3 +4122,171 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp)
41194122
return try_to_free_buffers(folio);
41204123
}
41214124
EXPORT_SYMBOL(filemap_release_folio);
4125+
4126+
#ifdef CONFIG_CACHESTAT_SYSCALL
4127+
/**
4128+
* filemap_cachestat() - compute the page cache statistics of a mapping
4129+
* @mapping: The mapping to compute the statistics for.
4130+
* @first_index: The starting page cache index.
4131+
* @last_index: The final page index (inclusive).
4132+
* @cs: the cachestat struct to write the result to.
4133+
*
4134+
* This will query the page cache statistics of a mapping in the
4135+
* page range of [first_index, last_index] (inclusive). The statistics
4136+
* queried include: number of dirty pages, number of pages marked for
4137+
* writeback, and the number of (recently) evicted pages.
4138+
*/
4139+
static void filemap_cachestat(struct address_space *mapping,
4140+
pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
4141+
{
4142+
XA_STATE(xas, &mapping->i_pages, first_index);
4143+
struct folio *folio;
4144+
4145+
rcu_read_lock();
4146+
xas_for_each(&xas, folio, last_index) {
4147+
unsigned long nr_pages;
4148+
pgoff_t folio_first_index, folio_last_index;
4149+
4150+
if (xas_retry(&xas, folio))
4151+
continue;
4152+
4153+
if (xa_is_value(folio)) {
4154+
/* page is evicted */
4155+
void *shadow = (void *)folio;
4156+
bool workingset; /* not used */
4157+
int order = xa_get_order(xas.xa, xas.xa_index);
4158+
4159+
nr_pages = 1 << order;
4160+
folio_first_index = round_down(xas.xa_index, 1 << order);
4161+
folio_last_index = folio_first_index + nr_pages - 1;
4162+
4163+
/* Folios might straddle the range boundaries, only count covered pages */
4164+
if (folio_first_index < first_index)
4165+
nr_pages -= first_index - folio_first_index;
4166+
4167+
if (folio_last_index > last_index)
4168+
nr_pages -= folio_last_index - last_index;
4169+
4170+
cs->nr_evicted += nr_pages;
4171+
4172+
#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
4173+
if (shmem_mapping(mapping)) {
4174+
/* shmem file - in swap cache */
4175+
swp_entry_t swp = radix_to_swp_entry(folio);
4176+
4177+
shadow = get_shadow_from_swap_cache(swp);
4178+
}
4179+
#endif
4180+
if (workingset_test_recent(shadow, true, &workingset))
4181+
cs->nr_recently_evicted += nr_pages;
4182+
4183+
goto resched;
4184+
}
4185+
4186+
nr_pages = folio_nr_pages(folio);
4187+
folio_first_index = folio_pgoff(folio);
4188+
folio_last_index = folio_first_index + nr_pages - 1;
4189+
4190+
/* Folios might straddle the range boundaries, only count covered pages */
4191+
if (folio_first_index < first_index)
4192+
nr_pages -= first_index - folio_first_index;
4193+
4194+
if (folio_last_index > last_index)
4195+
nr_pages -= folio_last_index - last_index;
4196+
4197+
/* page is in cache */
4198+
cs->nr_cache += nr_pages;
4199+
4200+
if (folio_test_dirty(folio))
4201+
cs->nr_dirty += nr_pages;
4202+
4203+
if (folio_test_writeback(folio))
4204+
cs->nr_writeback += nr_pages;
4205+
4206+
resched:
4207+
if (need_resched()) {
4208+
xas_pause(&xas);
4209+
cond_resched_rcu();
4210+
}
4211+
}
4212+
rcu_read_unlock();
4213+
}
4214+
4215+
/*
4216+
* The cachestat(2) system call.
4217+
*
4218+
* cachestat() returns the page cache statistics of a file in the
4219+
* bytes range specified by `off` and `len`: number of cached pages,
4220+
* number of dirty pages, number of pages marked for writeback,
4221+
* number of evicted pages, and number of recently evicted pages.
4222+
*
4223+
* An evicted page is a page that is previously in the page cache
4224+
* but has been evicted since. A page is recently evicted if its last
4225+
* eviction was recent enough that its reentry to the cache would
4226+
* indicate that it is actively being used by the system, and that
4227+
* there is memory pressure on the system.
4228+
*
4229+
* `off` and `len` must be non-negative integers. If `len` > 0,
4230+
* the queried range is [`off`, `off` + `len`]. If `len` == 0,
4231+
* we will query in the range from `off` to the end of the file.
4232+
*
4233+
* The `flags` argument is unused for now, but is included for future
4234+
* extensibility. User should pass 0 (i.e no flag specified).
4235+
*
4236+
* Currently, hugetlbfs is not supported.
4237+
*
4238+
* Because the status of a page can change after cachestat() checks it
4239+
* but before it returns to the application, the returned values may
4240+
* contain stale information.
4241+
*
4242+
* return values:
4243+
* zero - success
4244+
* -EFAULT - cstat or cstat_range points to an illegal address
4245+
* -EINVAL - invalid flags
4246+
* -EBADF - invalid file descriptor
4247+
* -EOPNOTSUPP - file descriptor is of a hugetlbfs file
4248+
*/
4249+
SYSCALL_DEFINE4(cachestat, unsigned int, fd,
4250+
struct cachestat_range __user *, cstat_range,
4251+
struct cachestat __user *, cstat, unsigned int, flags)
4252+
{
4253+
struct fd f = fdget(fd);
4254+
struct address_space *mapping;
4255+
struct cachestat_range csr;
4256+
struct cachestat cs;
4257+
pgoff_t first_index, last_index;
4258+
4259+
if (!f.file)
4260+
return -EBADF;
4261+
4262+
if (copy_from_user(&csr, cstat_range,
4263+
sizeof(struct cachestat_range))) {
4264+
fdput(f);
4265+
return -EFAULT;
4266+
}
4267+
4268+
/* hugetlbfs is not supported */
4269+
if (is_file_hugepages(f.file)) {
4270+
fdput(f);
4271+
return -EOPNOTSUPP;
4272+
}
4273+
4274+
if (flags != 0) {
4275+
fdput(f);
4276+
return -EINVAL;
4277+
}
4278+
4279+
first_index = csr.off >> PAGE_SHIFT;
4280+
last_index =
4281+
csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
4282+
memset(&cs, 0, sizeof(struct cachestat));
4283+
mapping = f.file->f_mapping;
4284+
filemap_cachestat(mapping, first_index, last_index, &cs);
4285+
fdput(f);
4286+
4287+
if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
4288+
return -EFAULT;
4289+
4290+
return 0;
4291+
}
4292+
#endif /* CONFIG_CACHESTAT_SYSCALL */

0 commit comments

Comments
 (0)