
Commit 3249fe4

Merge tag 'trace-v5.10-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace
Pull tracing fixes from Steven Rostedt:

 - Fix off-by-one error in retrieving the context buffer for trace_printk()
 - Fix off-by-one error in stack nesting limit
 - Fix recursion to not make all NMI code false positive as recursing
 - Stop losing events in function tracing when transitioning between irq context
 - Stop losing events in ring buffer when transitioning between irq context
 - Fix return code of error pointer in parse_synth_field() to prevent NULL pointer dereference
 - Fix false positive of NMI recursion in kprobe event handling

* tag 'trace-v5.10-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace:
  kprobes: Tell lockdep about kprobe nesting
  tracing: Make -ENOMEM the default error for parse_synth_field()
  ring-buffer: Fix recursion protection transitions between interrupt context
  tracing: Fix the checking of stackidx in __ftrace_trace_stack
  ftrace: Handle tracing when switching between context
  ftrace: Fix recursion check for NMI test
  tracing: Fix out of bounds write in get_trace_buf
2 parents 6732b35 + 645f224 commit 3249fe4

6 files changed, 107 insertions(+), 34 deletions(-)

kernel/kprobes.c

Lines changed: 21 additions & 4 deletions
@@ -1249,7 +1249,13 @@ __acquires(hlist_lock)
 
        *head = &kretprobe_inst_table[hash];
        hlist_lock = kretprobe_table_lock_ptr(hash);
-       raw_spin_lock_irqsave(hlist_lock, *flags);
+       /*
+        * Nested is a workaround that will soon not be needed.
+        * There's other protections that make sure the same lock
+        * is not taken on the same CPU that lockdep is unaware of.
+        * Differentiate when it is taken in NMI context.
+        */
+       raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi());
 }
 NOKPROBE_SYMBOL(kretprobe_hash_lock);

@@ -1258,7 +1264,13 @@ static void kretprobe_table_lock(unsigned long hash,
 __acquires(hlist_lock)
 {
        raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
-       raw_spin_lock_irqsave(hlist_lock, *flags);
+       /*
+        * Nested is a workaround that will soon not be needed.
+        * There's other protections that make sure the same lock
+        * is not taken on the same CPU that lockdep is unaware of.
+        * Differentiate when it is taken in NMI context.
+        */
+       raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi());
 }
 NOKPROBE_SYMBOL(kretprobe_table_lock);

@@ -2028,7 +2040,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 
        /* TODO: consider to only swap the RA after the last pre_handler fired */
        hash = hash_ptr(current, KPROBE_HASH_BITS);
-       raw_spin_lock_irqsave(&rp->lock, flags);
+       /*
+        * Nested is a workaround that will soon not be needed.
+        * There's other protections that make sure the same lock
+        * is not taken on the same CPU that lockdep is unaware of.
+        */
+       raw_spin_lock_irqsave_nested(&rp->lock, flags, 1);
        if (!hlist_empty(&rp->free_instances)) {
                ri = hlist_entry(rp->free_instances.first,
                                struct kretprobe_instance, hlist);

@@ -2039,7 +2056,7 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
        ri->task = current;
 
        if (rp->entry_handler && rp->entry_handler(ri, regs)) {
-               raw_spin_lock_irqsave(&rp->lock, flags);
+               raw_spin_lock_irqsave_nested(&rp->lock, flags, 1);
                hlist_add_head(&ri->hlist, &rp->free_instances);
                raw_spin_unlock_irqrestore(&rp->lock, flags);
                return 0;
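
Note: the only functional change here is the lockdep subclass argument. These locks are legitimately taken again when a kretprobe fires in NMI context, and without a distinct subclass lockdep flags that as recursive locking on the same CPU. A minimal kernel-style sketch of the same pattern is below; it is not the patched kretprobe code, and my_table_lock / my_hash_lock are made-up names used only for illustration.

/* Sketch only: raw_spin_lock_irqsave_nested() with a context-derived subclass. */
#include <linux/spinlock.h>
#include <linux/hardirq.h>      /* in_nmi() */

static DEFINE_RAW_SPINLOCK(my_table_lock);

static void my_hash_lock(unsigned long *flags)
{
        /*
         * Subclass 1 in NMI context, subclass 0 otherwise: lockdep then
         * treats the NMI-time acquisition as a distinct class and stops
         * reporting it as a self-deadlock.
         */
        raw_spin_lock_irqsave_nested(&my_table_lock, *flags, !!in_nmi());
}

static void my_hash_unlock(unsigned long *flags)
{
        raw_spin_unlock_irqrestore(&my_table_lock, *flags);
}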

kernel/trace/ring_buffer.c

Lines changed: 46 additions & 12 deletions
@@ -438,14 +438,16 @@ enum {
 };
 /*
  * Used for which event context the event is in.
- *  NMI     = 0
- *  IRQ     = 1
- *  SOFTIRQ = 2
- *  NORMAL  = 3
+ *  TRANSITION = 0
+ *  NMI     = 1
+ *  IRQ     = 2
+ *  SOFTIRQ = 3
+ *  NORMAL  = 4
  *
  * See trace_recursive_lock() comment below for more details.
  */
 enum {
+       RB_CTX_TRANSITION,
        RB_CTX_NMI,
        RB_CTX_IRQ,
        RB_CTX_SOFTIRQ,

@@ -3014,10 +3016,10 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
  * a bit of overhead in something as critical as function tracing,
  * we use a bitmask trick.
  *
- *  bit 0 =  NMI context
- *  bit 1 =  IRQ context
- *  bit 2 =  SoftIRQ context
- *  bit 3 =  normal context.
+ *  bit 1 =  NMI context
+ *  bit 2 =  IRQ context
+ *  bit 3 =  SoftIRQ context
+ *  bit 4 =  normal context.
  *
  * This works because this is the order of contexts that can
  * preempt other contexts. A SoftIRQ never preempts an IRQ

@@ -3040,6 +3042,30 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
  * The least significant bit can be cleared this way, and it
  * just so happens that it is the same bit corresponding to
  * the current context.
+ *
+ * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit
+ * is set when a recursion is detected at the current context, and if
+ * the TRANSITION bit is already set, it will fail the recursion.
+ * This is needed because there's a lag between the changing of
+ * interrupt context and updating the preempt count. In this case,
+ * a false positive will be found. To handle this, one extra recursion
+ * is allowed, and this is done by the TRANSITION bit. If the TRANSITION
+ * bit is already set, then it is considered a recursion and the function
+ * ends. Otherwise, the TRANSITION bit is set, and that bit is returned.
+ *
+ * On the trace_recursive_unlock(), the TRANSITION bit will be the first
+ * to be cleared. Even if it wasn't the context that set it. That is,
+ * if an interrupt comes in while NORMAL bit is set and the ring buffer
+ * is called before preempt_count() is updated, since the check will
+ * be on the NORMAL bit, the TRANSITION bit will then be set. If an
+ * NMI then comes in, it will set the NMI bit, but when the NMI code
+ * does the trace_recursive_unlock() it will clear the TRANSTION bit
+ * and leave the NMI bit set. But this is fine, because the interrupt
+ * code that set the TRANSITION bit will then clear the NMI bit when it
+ * calls trace_recursive_unlock(). If another NMI comes in, it will
+ * set the TRANSITION bit and continue.
+ *
+ * Note: The TRANSITION bit only handles a single transition between context.
  */
 
 static __always_inline int

@@ -3055,8 +3081,16 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
                bit = pc & NMI_MASK ? RB_CTX_NMI :
                        pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
 
-       if (unlikely(val & (1 << (bit + cpu_buffer->nest))))
-               return 1;
+       if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) {
+               /*
+                * It is possible that this was called by transitioning
+                * between interrupt context, and preempt_count() has not
+                * been updated yet. In this case, use the TRANSITION bit.
+                */
+               bit = RB_CTX_TRANSITION;
+               if (val & (1 << (bit + cpu_buffer->nest)))
+                       return 1;
+       }
 
        val |= (1 << (bit + cpu_buffer->nest));
        cpu_buffer->current_context = val;

@@ -3071,8 +3105,8 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
                cpu_buffer->current_context - (1 << cpu_buffer->nest);
 }
 
-/* The recursive locking above uses 4 bits */
-#define NESTED_BITS 4
+/* The recursive locking above uses 5 bits */
+#define NESTED_BITS 5
 
 /**
  * ring_buffer_nest_start - Allow to trace while nested
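
To make the TRANSITION comment above concrete, here is a small standalone sketch (plain userspace C, not kernel code) of the bitmask recursion check with the extra TRANSITION bit. The caller passes the context bit that preempt_count() would normally yield, which makes the "lagging preempt_count" case easy to simulate; all names are illustrative.

/* Standalone illustration of the recursion-protection bitmask. */
#include <assert.h>
#include <stdio.h>

enum { CTX_TRANSITION, CTX_NMI, CTX_IRQ, CTX_SOFTIRQ, CTX_NORMAL };

static unsigned int current_context;

/* Returns 0 if the event may be recorded, 1 if it looks like real recursion. */
static int recursive_lock(int bit)
{
        unsigned int val = current_context;

        if (val & (1 << bit)) {
                /*
                 * The bit for this context is already set: either a real
                 * recursion, or preempt_count() has not caught up with an
                 * interrupt transition. Allow one extra level via TRANSITION.
                 */
                bit = CTX_TRANSITION;
                if (val & (1 << bit))
                        return 1;
        }
        current_context = val | (1 << bit);
        return 0;
}

static void recursive_unlock(void)
{
        /* Clear the lowest set bit; a set TRANSITION bit is cleared first. */
        current_context &= current_context - 1;
}

int main(void)
{
        /* Normal context records an event. */
        assert(recursive_lock(CTX_NORMAL) == 0);
        /* An IRQ fires before preempt_count() is updated, so the check still
         * lands on CTX_NORMAL; the TRANSITION bit absorbs the false positive. */
        assert(recursive_lock(CTX_NORMAL) == 0);
        /* A second hit in the same window is treated as real recursion. */
        assert(recursive_lock(CTX_NORMAL) == 1);

        recursive_unlock();     /* drops TRANSITION */
        recursive_unlock();     /* drops NORMAL */
        printf("context mask after unwinding: %#x\n", current_context);
        return 0;
}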

kernel/trace/trace.c

Lines changed: 3 additions & 3 deletions
@@ -2750,7 +2750,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
        /*
         * If tracing is off, but we have triggers enabled
         * we still need to look at the event data. Use the temp_buffer
-        * to store the trace event for the tigger to use. It's recusive
+        * to store the trace event for the trigger to use. It's recursive
         * safe and will not be recorded anywhere.
         */
        if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {

@@ -2952,7 +2952,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
        stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;
 
        /* This should never happen. If it does, yell once and skip */
-       if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING))
+       if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING))
                goto out;
 
        /*

@@ -3132,7 +3132,7 @@ static char *get_trace_buf(void)
 
        /* Interrupts must see nesting incremented before we use the buffer */
        barrier();
-       return &buffer->buffer[buffer->nesting][0];
+       return &buffer->buffer[buffer->nesting - 1][0];
 }
 
 static void put_trace_buf(void)
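
The last two hunks fix the same class of off-by-one: a per-CPU counter is bumped to reserve a nesting slot, so the valid zero-based indexes run from 0 to the array size minus one, and the overflow check must use ">=". Below is a standalone sketch of that reserve-then-index pattern (plain userspace C; the array size and names are illustrative, not the kernel's).

/* Reserve-then-index sketch behind both off-by-one fixes. */
#include <stdio.h>

#define KSTACK_NESTING 4

static char nest_bufs[KSTACK_NESTING][128];
static int nest_reserve;        /* a per-CPU counter in the kernel */

static char *reserve_buf(void)
{
        /* Bump the counter first so a nested user sees the reservation,
         * then convert the post-increment value to a zero-based index. */
        int stackidx = ++nest_reserve - 1;

        /*
         * Valid indexes are 0 .. KSTACK_NESTING - 1. With ">" instead of
         * ">=", stackidx == KSTACK_NESTING would slip through and write
         * one slot past the end of the array.
         */
        if (stackidx >= KSTACK_NESTING) {
                nest_reserve--;
                return NULL;
        }
        return nest_bufs[stackidx];
}

static void release_buf(void)
{
        nest_reserve--;
}

int main(void)
{
        /* The fifth reservation must fail instead of overrunning the array. */
        for (int i = 0; i < KSTACK_NESTING + 1; i++)
                printf("level %d -> %p\n", i, (void *)reserve_buf());
        return 0;
}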

kernel/trace/trace.h

Lines changed: 23 additions & 3 deletions
@@ -637,6 +637,12 @@ enum {
         * function is called to clear it.
         */
        TRACE_GRAPH_NOTRACE_BIT,
+
+       /*
+        * When transitioning between context, the preempt_count() may
+        * not be correct. Allow for a single recursion to cover this case.
+        */
+       TRACE_TRANSITION_BIT,
 };
 
 #define trace_recursion_set(bit)       do { (current)->trace_recursion |= (1<<(bit)); } while (0)

@@ -691,14 +697,27 @@ static __always_inline int trace_test_and_set_recursion(int start, int max)
                return 0;
 
        bit = trace_get_context_bit() + start;
-       if (unlikely(val & (1 << bit)))
-               return -1;
+       if (unlikely(val & (1 << bit))) {
+               /*
+                * It could be that preempt_count has not been updated during
+                * a switch between contexts. Allow for a single recursion.
+                */
+               bit = TRACE_TRANSITION_BIT;
+               if (trace_recursion_test(bit))
+                       return -1;
+               trace_recursion_set(bit);
+               barrier();
+               return bit + 1;
+       }
+
+       /* Normal check passed, clear the transition to allow it again */
+       trace_recursion_clear(TRACE_TRANSITION_BIT);
 
        val |= 1 << bit;
        current->trace_recursion = val;
        barrier();
 
-       return bit;
+       return bit + 1;
 }

@@ -708,6 +727,7 @@ static __always_inline void trace_clear_recursion(int bit)
        if (!bit)
                return;
 
+       bit--;
        bit = 1 << bit;
        val &= ~bit;
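
With this change the value handed back to callers is shifted by one (bit + 1 on success, undone by the new bit-- in trace_clear_recursion()), while failure is still a negative return and 0 still means "nothing to clear". For reference, a sketch of the caller-side convention, loosely modeled on the function tracer callback; the callback signature and body are simplified and my_trace_callback is not a real kernel symbol.

/* Sketch of how a tracing callback uses the recursion guard. */
static void my_trace_callback(unsigned long ip, unsigned long parent_ip)
{
        int bit;

        bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
        if (bit < 0)
                return;         /* genuine recursion, or a second transition */

        preempt_disable_notrace();
        /* ... record the event; one context transition is tolerated ... */
        preempt_enable_notrace();

        /* Pass the opaque handle (now bit + 1) back exactly as returned. */
        trace_clear_recursion(bit);
}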

kernel/trace/trace_events_synth.c

Lines changed: 7 additions & 10 deletions
@@ -584,7 +584,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
 {
        struct synth_field *field;
        const char *prefix = NULL, *field_type = argv[0], *field_name, *array;
-       int len, ret = 0;
+       int len, ret = -ENOMEM;
        struct seq_buf s;
        ssize_t size;
 
@@ -617,10 +617,9 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
        len--;
 
        field->name = kmemdup_nul(field_name, len, GFP_KERNEL);
-       if (!field->name) {
-               ret = -ENOMEM;
+       if (!field->name)
                goto free;
-       }
+
        if (!is_good_name(field->name)) {
                synth_err(SYNTH_ERR_BAD_NAME, errpos(field_name));
                ret = -EINVAL;

@@ -638,10 +637,9 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
                len += strlen(prefix);
 
        field->type = kzalloc(len, GFP_KERNEL);
-       if (!field->type) {
-               ret = -ENOMEM;
+       if (!field->type)
                goto free;
-       }
+
        seq_buf_init(&s, field->type, len);
        if (prefix)
                seq_buf_puts(&s, prefix);

@@ -653,6 +651,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
        }
        if (WARN_ON_ONCE(!seq_buf_buffer_left(&s)))
                goto free;
+
        s.buffer[s.len] = '\0';
 
        size = synth_field_size(field->type);

@@ -666,10 +665,8 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
 
                len = sizeof("__data_loc ") + strlen(field->type) + 1;
                type = kzalloc(len, GFP_KERNEL);
-               if (!type) {
-                       ret = -ENOMEM;
+               if (!type)
                        goto free;
-               }
 
                seq_buf_init(&s, type, len);
                seq_buf_puts(&s, "__data_loc ");
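
The pattern here is to make the most common failure (-ENOMEM) the default return value, so each bare "goto free" after a failed allocation no longer needs its own assignment, while rarer failures still set ret explicitly. Previously a bare "goto free" left ret at 0 and the caller received ERR_PTR(0), i.e. NULL, which it never checks. A standalone sketch of the same style follows (plain userspace C; parse_widget() and its fields are illustrative only).

/* "Default error code" cleanup style, illustrated outside the kernel. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct widget {
        char *name;
        char *type;
};

/* Returns 0 on success, a negative errno otherwise. */
static int parse_widget(struct widget *w, const char *name, const char *type)
{
        int ret = -ENOMEM;      /* default: a bare "goto free" means ENOMEM */

        memset(w, 0, sizeof(*w));       /* so the error path can free safely */

        w->name = strdup(name);
        if (!w->name)
                goto free;      /* no per-site "ret = -ENOMEM" needed */

        if (w->name[0] == '\0') {
                ret = -EINVAL;  /* non-allocation failures still set ret */
                goto free;
        }

        w->type = strdup(type);
        if (!w->type)
                goto free;

        return 0;

free:
        free(w->name);
        free(w->type);
        /*
         * With ret defaulting to 0, a bare "goto free" would report success
         * here -- the slip that made parse_synth_field() return ERR_PTR(0),
         * i.e. NULL, to a caller that never checks for it.
         */
        return ret;
}

int main(void)
{
        struct widget w;

        printf("parse: %d\n", parse_widget(&w, "", "u64"));
        return 0;
}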

kernel/trace/trace_selftest.c

Lines changed: 7 additions & 2 deletions
@@ -492,8 +492,13 @@ trace_selftest_function_recursion(void)
        unregister_ftrace_function(&test_rec_probe);
 
        ret = -1;
-       if (trace_selftest_recursion_cnt != 1) {
-               pr_cont("*callback not called once (%d)* ",
+       /*
+        * Recursion allows for transitions between context,
+        * and may call the callback twice.
+        */
+       if (trace_selftest_recursion_cnt != 1 &&
+           trace_selftest_recursion_cnt != 2) {
+               pr_cont("*callback not called once (or twice) (%d)* ",
                        trace_selftest_recursion_cnt);
                goto out;
        }
