Commit d9c1e1c
Xu Kuohai authored and Kernel Patches Daemon committed
bpf, arm64: Jit BPF_CALL to direct call when possible
Currently, BPF_CALL is always jited to an indirect call. When the target
is within the range of a direct call, BPF_CALL can be jited to a direct
call instead.

For example, the following BPF_CALL to __htab_map_lookup_elem is always
jited to an indirect call:

	mov	x10, #0xffffffffffff18f4
	movk	x10, #0x821, lsl #16
	movk	x10, #0x8000, lsl #32
	blr	x10

When the address of the target __htab_map_lookup_elem is within the
range of a direct call, the BPF_CALL can be jited to:

	bl	0xfffffffffd33bc98

This patch does such a jit optimization by emitting arm64 direct calls
for BPF_CALL when possible, and indirect calls otherwise.

Without this patch, the jit works as follows.

1. First pass
   A. Determine the jited position and size for each bpf instruction.
   B. Compute the jited image size.

2. Allocate the jited image with the size computed in step 1.

3. Second pass
   A. Adjust the jump offset for jump instructions.
   B. Write the final image.

This works because, for a given bpf prog, regardless of where the jited
image is allocated, the jited result for each instruction is fixed. The
second pass differs from the first only in adjusting the jump offsets,
like changing "jmp imm1" to "jmp imm2", while the position and size of
the "jmp" instruction remain unchanged.

Now consider whether to jit BPF_CALL to an arm64 direct or indirect call
instruction. The choice depends solely on the jump offset: a direct call
if the jump offset is within 128MB, an indirect call otherwise. For a
given BPF_CALL, the target address is known, so the jump offset is
determined by the jited address of the BPF_CALL instruction. In other
words, for a given bpf prog, the jited result for each BPF_CALL is
determined by its jited address.

The jited address of a BPF_CALL is the jited image address plus the
total jited size of all preceding instructions. For a given bpf prog,
there are clearly no BPF_CALL instructions before the first BPF_CALL
instruction. Since the jited results for all instructions other than
BPF_CALL are fixed, the total jited size preceding the first BPF_CALL is
also fixed. Therefore, once the jited image is allocated, the jited
address of the first BPF_CALL is fixed.

Now that the jited result for the first BPF_CALL is fixed, the jited
results for all instructions preceding the second BPF_CALL are fixed. So
the jited address and result for the second BPF_CALL are also fixed.
Similarly, we can conclude that the jited addresses and results for all
subsequent BPF_CALL instructions are fixed. This means that, for a given
bpf prog, once the jited image is allocated, the jited address and
result for all instructions, including all BPF_CALL instructions, are
fixed.

Based on this observation, with this patch, the jit works as follows.

1. First pass
   Estimate the maximum jited image size. In this pass, all BPF_CALLs
   are jited to arm64 indirect calls, since the jump offsets are unknown
   because the jited image has not yet been allocated.

2. Allocate the jited image with the size estimated in step 1.

3. Second pass
   A. Determine the jited result for each BPF_CALL.
   B. Determine the jited address and size for each bpf instruction.

4. Third pass
   A. Adjust the jump offset for jump instructions.
   B. Write the final image.

Signed-off-by: Xu Kuohai <[email protected]>
1 parent 069f2d9
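For background, the 128MB window mentioned above comes from the AArch64
BL encoding: the instruction carries a signed 26-bit word offset, so the
reachable range is ±2^25 instructions × 4 bytes = ±128MB. The following
is a minimal standalone sketch of that reachability check (illustrative
only; bl_in_range and the userspace SZ_128M macro are not from the
patch):

	#include <stdbool.h>

	#define SZ_128M (128L * 1024 * 1024)

	/* BL holds a signed 26-bit word offset, so a target is directly
	 * reachable only when it lies in [pc - 128MB, pc + 128MB).
	 */
	static bool bl_in_range(long pc, long target)
	{
		long offset = target - pc;

		return offset >= -SZ_128M && offset < SZ_128M;
	}

This is the same test the patch applies in should_emit_indirect_call(),
inverted: out-of-range (or unknown) targets fall back to the indirect
mov/blr sequence.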

File tree: 1 file changed (+75, -16)

arch/arm64/net/bpf_jit_comp.c

Lines changed: 75 additions & 16 deletions
@@ -84,6 +84,7 @@ struct jit_ctx {
 	u64 user_vm_start;
 	u64 arena_vm_start;
 	bool fp_used;
+	bool write;
 };
 
 struct bpf_plt {
@@ -97,7 +98,7 @@ struct bpf_plt {
 
 static inline void emit(const u32 insn, struct jit_ctx *ctx)
 {
-	if (ctx->image != NULL)
+	if (ctx->image != NULL && ctx->write)
 		ctx->image[ctx->idx] = cpu_to_le32(insn);
 
 	ctx->idx++;
@@ -182,14 +183,47 @@ static inline void emit_addr_mov_i64(const int reg, const u64 val,
 	}
 }
 
-static inline void emit_call(u64 target, struct jit_ctx *ctx)
+static bool should_emit_indirect_call(long target, const struct jit_ctx *ctx)
 {
-	u8 tmp = bpf2a64[TMP_REG_1];
+	long offset;
 
+	/* when ctx->ro_image is not allocated or the target is unknown,
+	 * emit indirect call
+	 */
+	if (!ctx->ro_image || !target)
+		return true;
+
+	offset = target - (long)&ctx->ro_image[ctx->idx];
+	return offset < -SZ_128M || offset >= SZ_128M;
+}
+
+static void emit_direct_call(u64 target, struct jit_ctx *ctx)
+{
+	u32 insn;
+	unsigned long pc;
+
+	pc = (unsigned long)&ctx->ro_image[ctx->idx];
+	insn = aarch64_insn_gen_branch_imm(pc, target, AARCH64_INSN_BRANCH_LINK);
+	emit(insn, ctx);
+}
+
+static void emit_indirect_call(u64 target, struct jit_ctx *ctx)
+{
+	u8 tmp;
+
+	tmp = bpf2a64[TMP_REG_1];
 	emit_addr_mov_i64(tmp, target, ctx);
 	emit(A64_BLR(tmp), ctx);
 }
 
+static void emit_call(u64 target, struct jit_ctx *ctx)
+{
+	if (should_emit_indirect_call((long)target, ctx))
+		emit_indirect_call(target, ctx);
+	else
+		emit_direct_call(target, ctx);
+}
+
 static inline int bpf2a64_offset(int bpf_insn, int off,
 				 const struct jit_ctx *ctx)
 {
@@ -1649,13 +1683,11 @@ static int build_body(struct jit_ctx *ctx, bool extra_pass)
 		const struct bpf_insn *insn = &prog->insnsi[i];
 		int ret;
 
-		if (ctx->image == NULL)
-			ctx->offset[i] = ctx->idx;
+		ctx->offset[i] = ctx->idx;
 		ret = build_insn(insn, ctx, extra_pass);
 		if (ret > 0) {
 			i++;
-			if (ctx->image == NULL)
-				ctx->offset[i] = ctx->idx;
+			ctx->offset[i] = ctx->idx;
 			continue;
 		}
 		if (ret)
@@ -1666,8 +1698,7 @@ static int build_body(struct jit_ctx *ctx, bool extra_pass)
 	 * the last element with the offset after the last
 	 * instruction (end of program)
 	 */
-	if (ctx->image == NULL)
-		ctx->offset[i] = ctx->idx;
+	ctx->offset[i] = ctx->idx;
 
 	return 0;
 }
@@ -1721,6 +1752,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	struct jit_ctx ctx;
 	u8 *image_ptr;
 	u8 *ro_image_ptr;
+	int body_idx;
+	int exentry_idx;
 
 	if (!prog->jit_requested)
 		return orig_prog;
@@ -1768,8 +1801,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	ctx.user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena);
 	ctx.arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena);
 
-	/*
-	 * 1. Initial fake pass to compute ctx->idx and ctx->offset.
+	/* Pass 1: Estimate the maximum image size.
 	 *
 	 * BPF line info needs ctx->offset[i] to be the offset of
 	 * instruction[i] in jited image, so build prologue first.
@@ -1792,7 +1824,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	extable_size = prog->aux->num_exentries *
 		       sizeof(struct exception_table_entry);
 
-	/* Now we know the actual image size. */
+	/* Now we know the maximum image size. */
 	prog_size = sizeof(u32) * ctx.idx;
 	/* also allocate space for plt target */
 	extable_offset = round_up(prog_size + PLT_TARGET_SIZE, extable_align);
@@ -1805,7 +1837,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		goto out_off;
 	}
 
-	/* 2. Now, the actual pass. */
+	/* Pass 2: Determine jited position and result for each instruction */
 
 	/*
 	 * Use the image(RW) for writing the JITed instructions. But also save
@@ -1821,30 +1853,56 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 skip_init_ctx:
 	ctx.idx = 0;
 	ctx.exentry_idx = 0;
+	ctx.write = true;
 
 	build_prologue(&ctx, was_classic);
 
+	/* Record exentry_idx and body_idx before first build_body */
+	exentry_idx = ctx.exentry_idx;
+	body_idx = ctx.idx;
+	/* Dont write body instructions to memory for now */
+	ctx.write = false;
+
 	if (build_body(&ctx, extra_pass)) {
 		prog = orig_prog;
 		goto out_free_hdr;
 	}
 
+	ctx.epilogue_offset = ctx.idx;
+	ctx.exentry_idx = exentry_idx;
+	ctx.idx = body_idx;
+	ctx.write = true;
+
+	/* Pass 3: Adjust jump offset and write final image */
+	if (build_body(&ctx, extra_pass) ||
+	    WARN_ON_ONCE(ctx.idx != ctx.epilogue_offset)) {
+		prog = orig_prog;
+		goto out_free_hdr;
+	}
+
 	build_epilogue(&ctx);
 	build_plt(&ctx);
 
-	/* 3. Extra pass to validate JITed code. */
+	/* Extra pass to validate JITed code. */
 	if (validate_ctx(&ctx)) {
 		prog = orig_prog;
 		goto out_free_hdr;
 	}
 
+	/* update the real prog size */
+	prog_size = sizeof(u32) * ctx.idx;
+
 	/* And we're done. */
 	if (bpf_jit_enable > 1)
 		bpf_jit_dump(prog->len, prog_size, 2, ctx.image);
 
 	if (!prog->is_func || extra_pass) {
-		if (extra_pass && ctx.idx != jit_data->ctx.idx) {
-			pr_err_once("multi-func JIT bug %d != %d\n",
+		/* The jited image may shrink since the jited result for
+		 * BPF_CALL to subprog may be changed from indirect call
+		 * to direct call.
+		 */
+		if (extra_pass && ctx.idx > jit_data->ctx.idx) {
+			pr_err_once("multi-func JIT bug %d > %d\n",
 				    ctx.idx, jit_data->ctx.idx);
 			prog->bpf_func = NULL;
 			prog->jited = 0;
@@ -2315,6 +2373,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image,
 		.image = image,
 		.ro_image = ro_image,
 		.idx = 0,
+		.write = true,
 	};
 
 	nregs = btf_func_model_nregs(m);
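A note on the mechanism: emit() is the linchpin of the new pass
structure. Because ctx->idx always advances even when nothing is stored,
the same code generator can serve both as a sizing pass (ctx.write ==
false) and a writing pass (ctx.write == true). A minimal standalone
sketch of this "measure, then emit" pattern (hypothetical types and
names, not kernel code):

	#include <stdbool.h>
	#include <stdint.h>

	struct sketch_ctx {
		uint32_t *image; /* NULL while only sizing */
		int idx;         /* next instruction slot */
		bool write;      /* store only when set */
	};

	static void sketch_emit(struct sketch_ctx *c, uint32_t insn)
	{
		if (c->image && c->write)
			c->image[c->idx] = insn;
		c->idx++; /* always advance so every pass agrees on layout */
	}

Running the generator with write == false yields sizes and offsets, and
running it again with write == true fills the buffer; the patch uses
this to add a middle pass in which BPF_CALL sizes settle once the image
address is known, before the final image is written.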
