Skip to content

Commit ef4b313

Browse files
Yonghong Song authored and
Kernel Patches Daemon committed
bpf: Implement mprog API on top of existing cgroup progs
Currently, cgroup progs are ordered by appending at attachment time. This is not ideal: in some cases, users want a specific ordering at a particular cgroup level. To address this, the existing mprog API seems an ideal solution, with its support for the BPF_F_BEFORE and BPF_F_AFTER flags. But there are a few obstacles to using the kernel mprog interface directly. Currently cgroup bpf progs already support prog attach/detach/replace and link-based attach/detach/replace. For example, in struct bpf_prog_array_item, the cgroup_storage field needs to be kept together with the bpf prog. But the mprog API struct bpf_mprog_fp only has bpf_prog as a member, which makes it difficult to use the kernel mprog interface. In another case, the current cgroup prog detach tries to use the same flags as in attach. This is different from the kernel mprog interface, which uses flags passed from user space. So to avoid modifying existing behavior, I made the following changes to support the mprog API for cgroup progs: - The support is for the prog list at the cgroup level. The cross-level prog list (a.k.a. effective prog list) is not supported. - Previously, BPF_F_PREORDER was supported only for prog attach; now BPF_F_PREORDER is also supported by link-based attach. - For attach, BPF_F_BEFORE/BPF_F_AFTER/BPF_F_ID are supported similarly to kernel mprog but with a different implementation. - For detach and replace, use the existing implementation. - For attach, detach and replace, the revision for a particular prog list, associated with a particular attach type, will be updated by increasing the count by 1. Signed-off-by: Yonghong Song <[email protected]>
1 parent 7f665c9 commit ef4b313

File tree

4 files changed

+165
-37
lines changed

4 files changed

+165
-37
lines changed

include/uapi/linux/bpf.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1794,6 +1794,13 @@ union bpf_attr {
17941794
};
17951795
__u64 expected_revision;
17961796
} netkit;
1797+
struct {
1798+
union {
1799+
__u32 relative_fd;
1800+
__u32 relative_id;
1801+
};
1802+
__u64 expected_revision;
1803+
} cgroup;
17971804
};
17981805
} link_create;
17991806

kernel/bpf/cgroup.c

Lines changed: 122 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,83 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
624624
return NULL;
625625
}
626626

627+
/*
 * get_anchor_prog() - pick the list entry a new entry is positioned against
 * @progs:    per-attach-type hlist of bpf_prog_list entries
 * @prog:     program being attached; only its ->type is read here
 * @flags:    attach flags (BPF_F_ID, BPF_F_BEFORE, BPF_F_AFTER, BPF_F_PREORDER)
 * @id_or_fd: prog id when BPF_F_ID is set, otherwise a prog fd; 0 without
 *            BPF_F_ID means "no anchor requested"
 * @ppltmp:   out: the chosen list entry.  It is expected to point to NULL on
 *            entry, because *ppltmp is tested to tell whether an entry has
 *            already been chosen.
 *
 * Without an anchor, *ppltmp is set to the first entry of @progs when
 * BPF_F_BEFORE is set and to the last entry otherwise (it is left untouched
 * for an empty list), and NULL is returned.
 *
 * With an anchor, the program is looked up by id or fd, must have the same
 * type as @prog, must be present in @progs and must agree with @flags on
 * BPF_F_PREORDER.  On success *ppltmp is its list entry and the anchor
 * program is returned without bpf_prog_put() having been called on it, so
 * the caller is responsible for dropping it.
 *
 * Return: NULL (no anchor), the anchor program, or an ERR_PTR():
 *         -EINVAL for a missing BEFORE/AFTER flag, a type mismatch or a
 *         preorder mismatch, -ENOENT when the anchor is not in @progs, or
 *         whatever the id/fd lookup failed with.
 */
static struct bpf_prog *get_anchor_prog(struct hlist_head *progs, struct bpf_prog *prog,
					u32 flags, u32 id_or_fd, struct bpf_prog_list **ppltmp)
{
	const bool want_preorder = flags & BPF_F_PREORDER;
	const bool by_id = flags & BPF_F_ID;
	struct bpf_prog_list *cur;
	struct bpf_prog *anchor;
	int err = -EINVAL;

	if (!by_id && !id_or_fd) {
		/* No anchor: settle on the head (BEFORE) or the tail entry. */
		hlist_for_each_entry(cur, progs, node) {
			if (*ppltmp && (flags & BPF_F_BEFORE))
				break;
			*ppltmp = cur;
		}
		return NULL;
	}

	/* An anchor only makes sense together with BPF_F_BEFORE or BPF_F_AFTER. */
	if (!(flags & (BPF_F_BEFORE | BPF_F_AFTER)))
		return ERR_PTR(-EINVAL);

	anchor = by_id ? bpf_prog_by_id(id_or_fd) : bpf_prog_get(id_or_fd);
	if (IS_ERR(anchor))
		return anchor;

	if (anchor->type != prog->type)
		goto put_anchor;

	hlist_for_each_entry(cur, progs, node) {
		/* Link-based entries keep their program in the link. */
		struct bpf_prog *cur_prog = cur->link ? cur->link->link.prog : cur->prog;

		if (cur_prog != anchor)
			continue;
		/* Anchor and new entry must both be preorder, or both not. */
		if (!!(cur->flags & BPF_F_PREORDER) != want_preorder)
			goto put_anchor;
		*ppltmp = cur;
		break;
	}

	if (*ppltmp)
		return anchor;

	/* The anchor program is not attached to this list. */
	err = -ENOENT;

put_anchor:
	bpf_prog_put(anchor);
	return ERR_PTR(err);
}
679+
680+
/*
 * insert_pl_to_hlist() - link @pl into @progs at the requested position
 * @pl:       new list entry to insert
 * @progs:    per-attach-type hlist of bpf_prog_list entries
 * @prog:     program being attached (passed through to get_anchor_prog())
 * @flags:    attach flags; BPF_F_BEFORE and BPF_F_AFTER are mutually exclusive
 * @id_or_fd: optional anchor program id (with BPF_F_ID) or fd
 *
 * An empty list always gets @pl as its head.  Otherwise @pl goes before the
 * entry chosen by get_anchor_prog() when BPF_F_BEFORE is set, and behind it
 * in every other case (including plain append with no flags).
 *
 * Return: 0 on success, -EINVAL when both BPF_F_BEFORE and BPF_F_AFTER are
 *         set, or the error reported by get_anchor_prog().
 */
static int insert_pl_to_hlist(struct bpf_prog_list *pl, struct hlist_head *progs,
			      struct bpf_prog *prog, u32 flags, u32 id_or_fd)
{
	struct bpf_prog_list *pltmp = NULL;
	struct bpf_prog *anchor_prog;

	/* flags cannot have both BPF_F_BEFORE and BPF_F_AFTER */
	if ((flags & BPF_F_BEFORE) && (flags & BPF_F_AFTER))
		return -EINVAL;

	anchor_prog = get_anchor_prog(progs, prog, flags, id_or_fd, &pltmp);
	if (IS_ERR(anchor_prog))
		return PTR_ERR(anchor_prog);

	if (hlist_empty(progs))
		hlist_add_head(&pl->node, progs);
	else if (flags & BPF_F_BEFORE)
		hlist_add_before(&pl->node, &pltmp->node);
	else
		hlist_add_behind(&pl->node, &pltmp->node);

	/*
	 * The anchor was only needed to locate pltmp.  get_anchor_prog()
	 * hands it back with the reference from bpf_prog_by_id() or
	 * bpf_prog_get() still held, so drop it here; otherwise every
	 * anchored attach leaks a prog reference.  It is NULL when no
	 * anchor was requested.
	 */
	if (anchor_prog)
		bpf_prog_put(anchor_prog);

	return 0;
}
703+
627704
/**
628705
* __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
629706
* propagate the change to descendants
@@ -633,14 +710,17 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
633710
* @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
634711
* @type: Type of attach operation
635712
* @flags: Option flags
713+
* @id_or_fd: Relative prog id or fd
714+
* @revision: bpf_prog_list revision
636715
*
637716
* Exactly one of @prog or @link can be non-null.
638717
* Must be called with cgroup_mutex held.
639718
*/
640719
static int __cgroup_bpf_attach(struct cgroup *cgrp,
641720
struct bpf_prog *prog, struct bpf_prog *replace_prog,
642721
struct bpf_cgroup_link *link,
643-
enum bpf_attach_type type, u32 flags)
722+
enum bpf_attach_type type, u32 flags, u32 id_or_fd,
723+
u64 revision)
644724
{
645725
u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
646726
struct bpf_prog *old_prog = NULL;
@@ -656,16 +736,22 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
656736
((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
657737
/* invalid combination */
658738
return -EINVAL;
739+
if ((flags & BPF_F_REPLACE) && (flags & (BPF_F_BEFORE | BPF_F_AFTER)))
740+
/* only either replace or insertion with before/after */
741+
return -EINVAL;
659742
if (link && (prog || replace_prog))
660743
/* only either link or prog/replace_prog can be specified */
661744
return -EINVAL;
662745
if (!!replace_prog != !!(flags & BPF_F_REPLACE))
663746
/* replace_prog implies BPF_F_REPLACE, and vice versa */
664747
return -EINVAL;
665748

749+
666750
atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
667751
if (atype < 0)
668752
return -EINVAL;
753+
if (revision && revision != cgrp->bpf.revisions[atype])
754+
return -ESTALE;
669755

670756
progs = &cgrp->bpf.progs[atype];
671757

@@ -694,22 +780,18 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
694780
if (pl) {
695781
old_prog = pl->prog;
696782
} else {
697-
struct hlist_node *last = NULL;
698-
699783
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
700784
if (!pl) {
701785
bpf_cgroup_storages_free(new_storage);
702786
return -ENOMEM;
703787
}
704-
if (hlist_empty(progs))
705-
hlist_add_head(&pl->node, progs);
706-
else
707-
hlist_for_each(last, progs) {
708-
if (last->next)
709-
continue;
710-
hlist_add_behind(&pl->node, last);
711-
break;
712-
}
788+
789+
err = insert_pl_to_hlist(pl, progs, prog ? : link->link.prog, flags, id_or_fd);
790+
if (err) {
791+
kfree(pl);
792+
bpf_cgroup_storages_free(new_storage);
793+
return err;
794+
}
713795
}
714796

715797
pl->prog = prog;
@@ -728,6 +810,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
728810
if (err)
729811
goto cleanup_trampoline;
730812

813+
cgrp->bpf.revisions[atype] += 1;
731814
if (old_prog) {
732815
if (type == BPF_LSM_CGROUP)
733816
bpf_trampoline_unlink_cgroup_shim(old_prog);
@@ -759,12 +842,13 @@ static int cgroup_bpf_attach(struct cgroup *cgrp,
759842
struct bpf_prog *prog, struct bpf_prog *replace_prog,
760843
struct bpf_cgroup_link *link,
761844
enum bpf_attach_type type,
762-
u32 flags)
845+
u32 flags, u32 id_or_fd, u64 revision)
763846
{
764847
int ret;
765848

766849
cgroup_lock();
767-
ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
850+
ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags,
851+
id_or_fd, revision);
768852
cgroup_unlock();
769853
return ret;
770854
}
@@ -852,6 +936,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
852936
if (!found)
853937
return -ENOENT;
854938

939+
cgrp->bpf.revisions[atype] += 1;
855940
old_prog = xchg(&link->link.prog, new_prog);
856941
replace_effective_prog(cgrp, atype, link);
857942
bpf_prog_put(old_prog);
@@ -977,12 +1062,14 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
9771062
* @prog: A program to detach or NULL
9781063
* @link: A link to detach or NULL
9791064
* @type: Type of detach operation
1065+
* @revision: bpf_prog_list revision
9801066
*
9811067
* At most one of @prog or @link can be non-NULL.
9821068
* Must be called with cgroup_mutex held.
9831069
*/
9841070
static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
985-
struct bpf_cgroup_link *link, enum bpf_attach_type type)
1071+
struct bpf_cgroup_link *link, enum bpf_attach_type type,
1072+
u64 revision)
9861073
{
9871074
enum cgroup_bpf_attach_type atype;
9881075
struct bpf_prog *old_prog;
@@ -1000,6 +1087,9 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
10001087
if (atype < 0)
10011088
return -EINVAL;
10021089

1090+
if (revision && revision != cgrp->bpf.revisions[atype])
1091+
return -ESTALE;
1092+
10031093
progs = &cgrp->bpf.progs[atype];
10041094
flags = cgrp->bpf.flags[atype];
10051095

@@ -1025,6 +1115,7 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
10251115

10261116
/* now can actually delete it from this cgroup list */
10271117
hlist_del(&pl->node);
1118+
cgrp->bpf.revisions[atype] += 1;
10281119

10291120
kfree(pl);
10301121
if (hlist_empty(progs))
@@ -1040,12 +1131,12 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
10401131
}
10411132

10421133
static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
1043-
enum bpf_attach_type type)
1134+
enum bpf_attach_type type, u64 revision)
10441135
{
10451136
int ret;
10461137

10471138
cgroup_lock();
1048-
ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
1139+
ret = __cgroup_bpf_detach(cgrp, prog, NULL, type, revision);
10491140
cgroup_unlock();
10501141
return ret;
10511142
}
@@ -1063,6 +1154,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
10631154
struct bpf_prog_array *effective;
10641155
int cnt, ret = 0, i;
10651156
int total_cnt = 0;
1157+
u64 revision = 0;
10661158
u32 flags;
10671159

10681160
if (effective_query && prog_attach_flags)
@@ -1100,6 +1192,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
11001192
return -EFAULT;
11011193
if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
11021194
return -EFAULT;
1195+
if (!effective_query && from_atype == to_atype)
1196+
revision = cgrp->bpf.revisions[from_atype];
1197+
if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
1198+
return -EFAULT;
11031199
if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
11041200
/* return early if user requested only program count + flags */
11051201
return 0;
@@ -1182,7 +1278,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr,
11821278
}
11831279

11841280
ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
1185-
attr->attach_type, attr->attach_flags);
1281+
attr->attach_type, attr->attach_flags,
1282+
attr->relative_fd, attr->expected_revision);
11861283

11871284
if (replace_prog)
11881285
bpf_prog_put(replace_prog);
@@ -1204,7 +1301,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
12041301
if (IS_ERR(prog))
12051302
prog = NULL;
12061303

1207-
ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
1304+
ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, attr->expected_revision);
12081305
if (prog)
12091306
bpf_prog_put(prog);
12101307

@@ -1233,7 +1330,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
12331330
}
12341331

12351332
WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
1236-
cg_link->type));
1333+
cg_link->type, 0));
12371334
if (cg_link->type == BPF_LSM_CGROUP)
12381335
bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
12391336

@@ -1312,7 +1409,8 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
13121409
struct cgroup *cgrp;
13131410
int err;
13141411

1315-
if (attr->link_create.flags)
1412+
if (attr->link_create.flags &&
1413+
(attr->link_create.flags & (~(BPF_F_ID | BPF_F_BEFORE | BPF_F_AFTER | BPF_F_PREORDER))))
13161414
return -EINVAL;
13171415

13181416
cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
@@ -1336,7 +1434,9 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
13361434
}
13371435

13381436
err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
1339-
link->type, BPF_F_ALLOW_MULTI);
1437+
link->type, BPF_F_ALLOW_MULTI | attr->link_create.flags,
1438+
attr->link_create.cgroup.relative_fd,
1439+
attr->link_create.cgroup.expected_revision);
13401440
if (err) {
13411441
bpf_link_cleanup(&link_primer);
13421442
goto out_put_cgroup;

kernel/bpf/syscall.c

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4184,6 +4184,25 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
41844184
}
41854185
}
41864186

4187+
static bool is_cgroup_prog_type(enum bpf_prog_type ptype, enum bpf_attach_type atype,
4188+
bool check_atype)
4189+
{
4190+
switch (ptype) {
4191+
case BPF_PROG_TYPE_CGROUP_DEVICE:
4192+
case BPF_PROG_TYPE_CGROUP_SKB:
4193+
case BPF_PROG_TYPE_CGROUP_SOCK:
4194+
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
4195+
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
4196+
case BPF_PROG_TYPE_CGROUP_SYSCTL:
4197+
case BPF_PROG_TYPE_SOCK_OPS:
4198+
return true;
4199+
case BPF_PROG_TYPE_LSM:
4200+
return check_atype ? atype == BPF_LSM_CGROUP : true;
4201+
default:
4202+
return false;
4203+
}
4204+
}
4205+
41874206
#define BPF_PROG_ATTACH_LAST_FIELD expected_revision
41884207

41894208
#define BPF_F_ATTACH_MASK_BASE \
@@ -4214,6 +4233,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
42144233
if (bpf_mprog_supported(ptype)) {
42154234
if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
42164235
return -EINVAL;
4236+
} else if (is_cgroup_prog_type(ptype, 0, false)) {
4237+
if (attr->attach_flags & BPF_F_LINK)
4238+
return -EINVAL;
42174239
} else {
42184240
if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE)
42194241
return -EINVAL;
@@ -4242,20 +4264,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
42424264
case BPF_PROG_TYPE_FLOW_DISSECTOR:
42434265
ret = netns_bpf_prog_attach(attr, prog);
42444266
break;
4245-
case BPF_PROG_TYPE_CGROUP_DEVICE:
4246-
case BPF_PROG_TYPE_CGROUP_SKB:
4247-
case BPF_PROG_TYPE_CGROUP_SOCK:
4248-
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
4249-
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
4250-
case BPF_PROG_TYPE_CGROUP_SYSCTL:
4251-
case BPF_PROG_TYPE_SOCK_OPS:
4252-
case BPF_PROG_TYPE_LSM:
4253-
if (ptype == BPF_PROG_TYPE_LSM &&
4254-
prog->expected_attach_type != BPF_LSM_CGROUP)
4255-
ret = -EINVAL;
4256-
else
4257-
ret = cgroup_bpf_prog_attach(attr, ptype, prog);
4258-
break;
42594267
case BPF_PROG_TYPE_SCHED_CLS:
42604268
if (attr->attach_type == BPF_TCX_INGRESS ||
42614269
attr->attach_type == BPF_TCX_EGRESS)
@@ -4264,7 +4272,10 @@ static int bpf_prog_attach(const union bpf_attr *attr)
42644272
ret = netkit_prog_attach(attr, prog);
42654273
break;
42664274
default:
4267-
ret = -EINVAL;
4275+
if (!is_cgroup_prog_type(ptype, prog->expected_attach_type, true))
4276+
ret = -EINVAL;
4277+
else
4278+
ret = cgroup_bpf_prog_attach(attr, ptype, prog);
42684279
}
42694280

42704281
if (ret)
@@ -4294,6 +4305,9 @@ static int bpf_prog_detach(const union bpf_attr *attr)
42944305
if (IS_ERR(prog))
42954306
return PTR_ERR(prog);
42964307
}
4308+
} else if (is_cgroup_prog_type(ptype, 0, false)) {
4309+
if (attr->attach_flags || attr->relative_fd)
4310+
return -EINVAL;
42974311
} else if (attr->attach_flags ||
42984312
attr->relative_fd ||
42994313
attr->expected_revision) {

0 commit comments

Comments
 (0)