Commit f8506c5

Alexei Starovoitov committed:
Merge branch 'bpf-reduce-memory-usage-for-bpf_global_percpu_ma'
Yonghong Song says:

====================
bpf: Reduce memory usage for bpf_global_percpu_ma

Currently, when a bpf program intends to allocate memory for a percpu
kptr, the verifier calls bpf_mem_alloc_init() to prefill all supported
unit sizes, which makes memory consumption very large on systems with
many cpus. For example, on a 128-cpu system, the total memory
consumption with initial prefill is ~175MB. Things become worse for
systems with even more cpus.

Patch 1 avoids unnecessary extra percpu memory allocation.
Patch 2 adds objcg to bpf_mem_alloc at init stage so objcg can be
associated with the root cgroup and passed to the later
bpf_mem_alloc_percpu_unit_init().
Patch 3 addresses the memory consumption issue by avoiding prefill
with all unit sizes, i.e. only prefilling with the user-specified size.
Patch 4 further reduces memory consumption by limiting the number of
prefill entries for percpu memory allocation.
Patch 5 uses much smaller low/high watermarks for percpu allocation
to reduce memory consumption.
Patch 6 rejects percpu memory allocation with bpf_global_percpu_ma
when the allocation size is greater than 512 bytes.
Patch 7 fixes the test_bpf_ma test broken by Patch 5.
Patch 8 adds one test to show the verification failure log message.

Changelogs:
  v5 -> v6:
    . Change bpf_mem_alloc_percpu_init() to add objcg as one of its
      parameters. For bpf_global_percpu_ma, the objcg is NULL,
      corresponding to the root memcg.
  v4 -> v5:
    . Do not do bpf_global_percpu_ma initialization at init stage;
      instead do initialization when the verifier knows it is going
      to be used by a bpf prog.
    . Use much smaller low/high watermarks for percpu allocation.
  v3 -> v4:
    . Add objcg to bpf_mem_alloc during init stage.
    . Initialize objcg at init stage but use it in
      bpf_mem_alloc_percpu_unit_init().
    . Remove check_obj_size() in bpf_mem_alloc_percpu_unit_init().
  v2 -> v3:
    . Clear the bpf_mem_cache if prefill fails.
    . Change test_bpf_ma percpu allocation tests to use bucket_size
      as the allocation size instead of bucket_size - 8.
    . Remove the __GFP_ZERO flag from the __alloc_percpu_gfp() call.
  v1 -> v2:
    . Avoid unnecessary extra percpu memory allocation.
    . Add a separate function to do bpf_global_percpu_ma initialization.
    . Promote the function-static 'sizes' array to file static.
    . Add comments to explain refilling only one item for percpu alloc.
====================

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Alexei Starovoitov <[email protected]>
2 parents 417fa6d + adc8c45 commit f8506c5
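
For context, the allocator this series tunes sits behind the bpf_percpu_obj_new() kfunc. Below is a minimal sketch of a bpf program that exercises that path, modeled on the selftests touched by this series; the program name, struct layout, and attach point are illustrative, and bpf_experimental.h is the selftests' header providing the kfunc wrappers.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_experimental.h"

/* Illustrative percpu object; any scalar struct up to 512 bytes is
 * accepted by the verifier after this series.
 */
struct val_t {
        long data[8];
};

SEC("?fentry/bpf_fentry_test1")
int BPF_PROG(percpu_alloc_example)
{
        struct val_t __percpu_kptr *p;

        /* The first bpf_percpu_obj_new() seen by the verifier triggers
         * on-demand init of bpf_global_percpu_ma, and each distinct type
         * size initializes only its own size class.
         */
        p = bpf_percpu_obj_new(struct val_t);
        if (!p)
                return 0;

        bpf_percpu_obj_drop(p);
        return 0;
}

char _license[] SEC("license") = "GPL";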

6 files changed: +184, -66 lines

include/linux/bpf_mem_alloc.h

Lines changed: 8 additions & 0 deletions

@@ -11,6 +11,7 @@ struct bpf_mem_caches;
 struct bpf_mem_alloc {
         struct bpf_mem_caches __percpu *caches;
         struct bpf_mem_cache __percpu *cache;
+        struct obj_cgroup *objcg;
         bool percpu;
         struct work_struct work;
 };
@@ -21,8 +22,15 @@ struct bpf_mem_alloc {
  * 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects.
  * Alloc and free are done with bpf_mem_{alloc,free}() and the size of
  * the returned object is given by the size argument of bpf_mem_alloc().
+ * If percpu equals true, error will be returned in order to avoid
+ * large memory consumption and the below bpf_mem_alloc_percpu_unit_init()
+ * should be used to do on-demand per-cpu allocation for each size.
  */
 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
+/* Initialize a non-fix-size percpu memory allocator */
+int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg);
+/* The percpu allocation with a specific unit size. */
+int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size);
 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);

 /* kmalloc/kfree equivalent: */
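
Taken together, the new interface splits percpu allocator setup into two steps: a one-time bpf_mem_alloc_percpu_init() that allocates the per-cpu cache array and records the objcg, and a per-size bpf_mem_alloc_percpu_unit_init() that prefills only the unit size actually requested. A sketch of the intended call sequence, mirroring how the verifier uses the API later in this series (the example_* names are hypothetical and error handling is minimal):

#include <linux/bpf_mem_alloc.h>

static struct bpf_mem_alloc example_percpu_ma; /* hypothetical allocator */

static int example_percpu_setup(int unit_size)
{
        int err;

        /* One-time init; a NULL objcg charges allocations to the root memcg. */
        err = bpf_mem_alloc_percpu_init(&example_percpu_ma, NULL);
        if (err)
                return err;

        /* On-demand init of just the size class that is needed; returns
         * -EINVAL if unit_size does not map to a supported size class.
         */
        return bpf_mem_alloc_percpu_unit_init(&example_percpu_ma, unit_size);
}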

kernel/bpf/memalloc.c

Lines changed: 81 additions & 12 deletions

@@ -121,6 +121,8 @@ struct bpf_mem_caches {
         struct bpf_mem_cache cache[NUM_CACHES];
 };

+static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+
 static struct llist_node notrace *__llist_del_first(struct llist_head *head)
 {
         struct llist_node *entry, *next;
@@ -462,11 +464,17 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c)
  * consume ~ 11 Kbyte per cpu.
  * Typical case will be between 11K and 116K closer to 11K.
  * bpf progs can and should share bpf_mem_cache when possible.
+ *
+ * Percpu allocation is typically rare. To avoid potential unnecessary large
+ * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1.
  */
 static void init_refill_work(struct bpf_mem_cache *c)
 {
         init_irq_work(&c->refill_work, bpf_mem_refill);
-        if (c->unit_size <= 256) {
+        if (c->percpu_size) {
+                c->low_watermark = 1;
+                c->high_watermark = 3;
+        } else if (c->unit_size <= 256) {
                 c->low_watermark = 32;
                 c->high_watermark = 96;
         } else {
@@ -483,11 +491,16 @@ static void init_refill_work(struct bpf_mem_cache *c)

 static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
 {
-        /* To avoid consuming memory assume that 1st run of bpf
-         * prog won't be doing more than 4 map_update_elem from
-         * irq disabled region
+        int cnt = 1;
+
+        /* To avoid consuming memory, for non-percpu allocation, assume that
+         * 1st run of bpf prog won't be doing more than 4 map_update_elem from
+         * irq disabled region if unit size is less than or equal to 256.
+         * For all other cases, let us just do one allocation.
          */
-        alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false);
+        if (!c->percpu_size && c->unit_size <= 256)
+                cnt = 4;
+        alloc_bulk(c, cnt, cpu_to_node(cpu), false);
 }

 /* When size != 0 bpf_mem_cache for each cpu.
@@ -499,12 +512,14 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
  */
 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 {
-        static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
         struct bpf_mem_caches *cc, __percpu *pcc;
         struct bpf_mem_cache *c, __percpu *pc;
         struct obj_cgroup *objcg = NULL;
         int cpu, i, unit_size, percpu_size = 0;

+        if (percpu && size == 0)
+                return -EINVAL;
+
         /* room for llist_node and per-cpu pointer */
         if (percpu)
                 percpu_size = LLIST_NODE_SZ + sizeof(void *);
@@ -523,6 +538,8 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
         if (memcg_bpf_enabled())
                 objcg = get_obj_cgroup_from_current();
 #endif
+        ma->objcg = objcg;
+
         for_each_possible_cpu(cpu) {
                 c = per_cpu_ptr(pc, cpu);
                 c->unit_size = unit_size;
@@ -542,6 +559,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 #ifdef CONFIG_MEMCG_KMEM
         objcg = get_obj_cgroup_from_current();
 #endif
+        ma->objcg = objcg;
         for_each_possible_cpu(cpu) {
                 cc = per_cpu_ptr(pcc, cpu);
                 for (i = 0; i < NUM_CACHES; i++) {
@@ -560,6 +578,56 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
         return 0;
 }

+int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
+{
+        struct bpf_mem_caches __percpu *pcc;
+
+        pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
+        if (!pcc)
+                return -ENOMEM;
+
+        ma->caches = pcc;
+        ma->objcg = objcg;
+        ma->percpu = true;
+        return 0;
+}
+
+int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
+{
+        struct bpf_mem_caches *cc, __percpu *pcc;
+        int cpu, i, unit_size, percpu_size;
+        struct obj_cgroup *objcg;
+        struct bpf_mem_cache *c;
+
+        i = bpf_mem_cache_idx(size);
+        if (i < 0)
+                return -EINVAL;
+
+        /* room for llist_node and per-cpu pointer */
+        percpu_size = LLIST_NODE_SZ + sizeof(void *);
+
+        unit_size = sizes[i];
+        objcg = ma->objcg;
+        pcc = ma->caches;
+
+        for_each_possible_cpu(cpu) {
+                cc = per_cpu_ptr(pcc, cpu);
+                c = &cc->cache[i];
+                if (cpu == 0 && c->unit_size)
+                        break;
+
+                c->unit_size = unit_size;
+                c->objcg = objcg;
+                c->percpu_size = percpu_size;
+                c->tgt = c;
+
+                init_refill_work(c);
+                prefill_mem_cache(c, cpu);
+        }
+
+        return 0;
+}
+
 static void drain_mem_cache(struct bpf_mem_cache *c)
 {
         bool percpu = !!c->percpu_size;
@@ -691,9 +759,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
                         rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
                         rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
                 }
-                /* objcg is the same across cpus */
-                if (c->objcg)
-                        obj_cgroup_put(c->objcg);
+                if (ma->objcg)
+                        obj_cgroup_put(ma->objcg);
                 destroy_mem_alloc(ma, rcu_in_progress);
         }
         if (ma->caches) {
@@ -709,8 +776,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
                                 rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
                         }
                 }
-                if (c->objcg)
-                        obj_cgroup_put(c->objcg);
+                if (ma->objcg)
+                        obj_cgroup_put(ma->objcg);
                 destroy_mem_alloc(ma, rcu_in_progress);
         }
 }
@@ -833,7 +900,9 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
         if (!size)
                 return NULL;

-        idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
+        if (!ma->percpu)
+                size += LLIST_NODE_SZ;
+        idx = bpf_mem_cache_idx(size);
         if (idx < 0)
                 return NULL;

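The new watermark comment is backed by simple arithmetic: init_refill_work() derives the refill batch from the watermark gap, so low_watermark = 1 and high_watermark = 3 yield a batch of one object per refill. A standalone sketch of that calculation, assuming the batch formula max((high - low) / 4 * 3, 1) from the existing init_refill_work() is unchanged by this series:

#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* Assumed to mirror the batch computation at the end of init_refill_work(). */
static int refill_batch(int low_watermark, int high_watermark)
{
        return MAX((high_watermark - low_watermark) / 4 * 3, 1);
}

int main(void)
{
        /* Percpu caches after this series: low = 1, high = 3 -> batch = 1. */
        printf("percpu batch = %d\n", refill_batch(1, 3));
        /* Small non-percpu caches: low = 32, high = 96 -> batch = 48. */
        printf("small-object batch = %d\n", refill_batch(32, 96));
        return 0;
}
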
kernel/bpf/verifier.c

Lines changed: 31 additions & 14 deletions

@@ -195,6 +195,8 @@ struct bpf_verifier_stack_elem {
                                           POISON_POINTER_DELTA))
 #define BPF_MAP_PTR(X)  ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))

+#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
+
 static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
 static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
 static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
@@ -12139,20 +12141,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
         if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
                 return -ENOMEM;

-        if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
-                if (!bpf_global_percpu_ma_set) {
-                        mutex_lock(&bpf_percpu_ma_lock);
-                        if (!bpf_global_percpu_ma_set) {
-                                err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
-                                if (!err)
-                                        bpf_global_percpu_ma_set = true;
-                        }
-                        mutex_unlock(&bpf_percpu_ma_lock);
-                        if (err)
-                                return err;
-                }
-        }
-
         if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
                 verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
                 return -EINVAL;
@@ -12173,6 +12161,35 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
                 return -EINVAL;
         }

+        if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+                if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
+                        verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
+                                ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
+                        return -EINVAL;
+                }
+
+                if (!bpf_global_percpu_ma_set) {
+                        mutex_lock(&bpf_percpu_ma_lock);
+                        if (!bpf_global_percpu_ma_set) {
+                                /* Charge memory allocated with bpf_global_percpu_ma to
+                                 * root memcg. The obj_cgroup for root memcg is NULL.
+                                 */
+                                err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
+                                if (!err)
+                                        bpf_global_percpu_ma_set = true;
+                        }
+                        mutex_unlock(&bpf_percpu_ma_lock);
+                        if (err)
+                                return err;
+                }
+
+                mutex_lock(&bpf_percpu_ma_lock);
+                err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
+                mutex_unlock(&bpf_percpu_ma_lock);
+                if (err)
+                        return err;
+        }
+
         struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
         if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
                 if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {

tools/testing/selftests/bpf/prog_tests/test_bpf_ma.c

Lines changed: 13 additions & 7 deletions

@@ -14,7 +14,8 @@ static void do_bpf_ma_test(const char *name)
         struct test_bpf_ma *skel;
         struct bpf_program *prog;
         struct btf *btf;
-        int i, err;
+        int i, err, id;
+        char tname[32];

         skel = test_bpf_ma__open();
         if (!ASSERT_OK_PTR(skel, "open"))
@@ -25,16 +26,21 @@ static void do_bpf_ma_test(const char *name)
                 goto out;

         for (i = 0; i < ARRAY_SIZE(skel->rodata->data_sizes); i++) {
-                char name[32];
-                int id;
-
-                snprintf(name, sizeof(name), "bin_data_%u", skel->rodata->data_sizes[i]);
-                id = btf__find_by_name_kind(btf, name, BTF_KIND_STRUCT);
-                if (!ASSERT_GT(id, 0, "bin_data"))
+                snprintf(tname, sizeof(tname), "bin_data_%u", skel->rodata->data_sizes[i]);
+                id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT);
+                if (!ASSERT_GT(id, 0, tname))
                         goto out;
                 skel->rodata->data_btf_ids[i] = id;
         }

+        for (i = 0; i < ARRAY_SIZE(skel->rodata->percpu_data_sizes); i++) {
+                snprintf(tname, sizeof(tname), "percpu_bin_data_%u", skel->rodata->percpu_data_sizes[i]);
+                id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT);
+                if (!ASSERT_GT(id, 0, tname))
+                        goto out;
+                skel->rodata->percpu_data_btf_ids[i] = id;
+        }
+
         prog = bpf_object__find_program_by_name(skel->obj, name);
         if (!ASSERT_OK_PTR(prog, "invalid prog name"))
                 goto out;

tools/testing/selftests/bpf/progs/percpu_alloc_fail.c

Lines changed: 18 additions & 0 deletions

@@ -17,6 +17,10 @@ struct val_with_rb_root_t {
         struct bpf_spin_lock lock;
 };

+struct val_600b_t {
+        char b[600];
+};
+
 struct elem {
         long sum;
         struct val_t __percpu_kptr *pc;
@@ -161,4 +165,18 @@ int BPF_PROG(test_array_map_7)
         return 0;
 }

+SEC("?fentry.s/bpf_fentry_test1")
+__failure __msg("bpf_percpu_obj_new type size (600) is greater than 512")
+int BPF_PROG(test_array_map_8)
+{
+        struct val_600b_t __percpu_kptr *p;
+
+        p = bpf_percpu_obj_new(struct val_600b_t);
+        if (!p)
+                return 0;
+
+        bpf_percpu_obj_drop(p);
+        return 0;
+}
+
 char _license[] SEC("license") = "GPL";
