Skip to content

Commit 12c08a9

Browse files
seranianacmel
authored and committed
perf stat: Add per-core aggregation
This patch adds the --per-core option to perf stat. This option is used to aggregate system-wide counts on a per physical core basis. On processors with hyperthreading, this means counts of all HT threads running on a physical core are aggregated. This mode is useful to find imbalance between physical cores running a uniform workload. Cores are identified by socket: S0-C1 means physical core 1 on socket 0. Note that cores are identified using their physical core id, thus their numbering may not be continuous. Per core aggregation can be combined with interval printing: # perf stat -a --per-core -I 1000 -e cycles sleep 1000 # time core cpus counts events 1.000090030 S0-C0 1 4,765,747 cycles 1.000090030 S0-C1 1 5,580,647 cycles 1.000090030 S0-C2 1 221,181 cycles 1.000090030 S0-C3 1 266,092 cycles Signed-off-by: Stephane Eranian <[email protected]> Cc: Andi Kleen <[email protected]> Cc: Ingo Molnar <[email protected]> Cc: Jiri Olsa <[email protected]> Cc: Namhyung Kim <[email protected]> Cc: Peter Zijlstra <[email protected]> Link: http://lkml.kernel.org/r/[email protected] [ committer note: Remove parts already applied on 86ee6e1 to keep bisectability ] Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
1 parent d430495 commit 12c08a9

File tree

4 files changed

+92
-3
lines changed

4 files changed

+92
-3
lines changed

tools/perf/Documentation/perf-stat.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,12 @@ use --per-socket in addition to -a. (system-wide). The output includes the
126126
socket number and the number of online processors on that socket. This is
127127
useful to gauge the amount of aggregation.
128128

129+
--per-core::
130+
Aggregate counts per physical processor for system-wide mode measurements. This
131+
is a useful mode to detect imbalance between physical cores. To enable this mode,
132+
use --per-core in addition to -a. (system-wide). The output includes the
133+
core number and the number of online logical processors on that physical processor.
134+
129135
EXAMPLES
130136
--------
131137

tools/perf/builtin-stat.c

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ enum aggr_mode {
8080
AGGR_NONE,
8181
AGGR_GLOBAL,
8282
AGGR_SOCKET,
83+
AGGR_CORE,
8384
};
8485

8586
static int run_count = 1;
@@ -384,6 +385,9 @@ static void print_interval(void)
384385
case AGGR_SOCKET:
385386
fprintf(output, "# time socket cpus counts events\n");
386387
break;
388+
case AGGR_CORE:
389+
fprintf(output, "# time core cpus counts events\n");
390+
break;
387391
case AGGR_NONE:
388392
fprintf(output, "# time CPU counts events\n");
389393
break;
@@ -397,6 +401,7 @@ static void print_interval(void)
397401
num_print_interval = 0;
398402

399403
switch (aggr_mode) {
404+
case AGGR_CORE:
400405
case AGGR_SOCKET:
401406
print_aggr(prefix);
402407
break;
@@ -566,13 +571,23 @@ static void print_noise(struct perf_evsel *evsel, double avg)
566571
print_noise_pct(stddev_stats(&ps->res_stats[0]), avg);
567572
}
568573

569-
static void aggr_printout(struct perf_evsel *evsel, int cpu, int nr)
574+
static void aggr_printout(struct perf_evsel *evsel, int id, int nr)
570575
{
571576
switch (aggr_mode) {
577+
case AGGR_CORE:
578+
fprintf(output, "S%d-C%*d%s%*d%s",
579+
cpu_map__id_to_socket(id),
580+
csv_output ? 0 : -8,
581+
cpu_map__id_to_cpu(id),
582+
csv_sep,
583+
csv_output ? 0 : 4,
584+
nr,
585+
csv_sep);
586+
break;
572587
case AGGR_SOCKET:
573588
fprintf(output, "S%*d%s%*d%s",
574589
csv_output ? 0 : -5,
575-
cpu,
590+
id,
576591
csv_sep,
577592
csv_output ? 0 : 4,
578593
nr,
@@ -581,7 +596,7 @@ static void aggr_printout(struct perf_evsel *evsel, int cpu, int nr)
581596
case AGGR_NONE:
582597
fprintf(output, "CPU%*d%s",
583598
csv_output ? 0 : -4,
584-
perf_evsel__cpus(evsel)->map[cpu], csv_sep);
599+
perf_evsel__cpus(evsel)->map[id], csv_sep);
585600
break;
586601
case AGGR_GLOBAL:
587602
default:
@@ -1095,6 +1110,7 @@ static void print_stat(int argc, const char **argv)
10951110
}
10961111

10971112
switch (aggr_mode) {
1113+
case AGGR_CORE:
10981114
case AGGR_SOCKET:
10991115
print_aggr(NULL);
11001116
break;
@@ -1163,6 +1179,13 @@ static int perf_stat_init_aggr_mode(void)
11631179
}
11641180
aggr_get_id = cpu_map__get_socket;
11651181
break;
1182+
case AGGR_CORE:
1183+
if (cpu_map__build_core_map(evsel_list->cpus, &aggr_map)) {
1184+
perror("cannot build core map");
1185+
return -1;
1186+
}
1187+
aggr_get_id = cpu_map__get_core;
1188+
break;
11661189
case AGGR_NONE:
11671190
case AGGR_GLOBAL:
11681191
default:
@@ -1372,6 +1395,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
13721395
"print counts at regular interval in ms (>= 100)"),
13731396
OPT_SET_UINT(0, "per-socket", &aggr_mode,
13741397
"aggregate counts per processor socket", AGGR_SOCKET),
1398+
OPT_SET_UINT(0, "per-core", &aggr_mode,
1399+
"aggregate counts per physical processor core", AGGR_CORE),
13751400
OPT_END()
13761401
};
13771402
const char * const stat_usage[] = {

tools/perf/util/cpumap.c

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,53 @@ static int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
267267
return 0;
268268
}
269269

270+
int cpu_map__get_core(struct cpu_map *map, int idx)
271+
{
272+
FILE *fp;
273+
const char *mnt;
274+
char path[PATH_MAX];
275+
int cpu, ret, s;
276+
277+
if (idx > map->nr)
278+
return -1;
279+
280+
cpu = map->map[idx];
281+
282+
mnt = sysfs_find_mountpoint();
283+
if (!mnt)
284+
return -1;
285+
286+
snprintf(path, PATH_MAX,
287+
"%s/devices/system/cpu/cpu%d/topology/core_id",
288+
mnt, cpu);
289+
290+
fp = fopen(path, "r");
291+
if (!fp)
292+
return -1;
293+
ret = fscanf(fp, "%d", &cpu);
294+
fclose(fp);
295+
if (ret != 1)
296+
return -1;
297+
298+
s = cpu_map__get_socket(map, idx);
299+
if (s == -1)
300+
return -1;
301+
302+
/*
303+
* encode socket in upper 16 bits
304+
* core_id is relative to socket, and
305+
* we need a global id. So we combine
306+
* socket+ core id
307+
*/
308+
return (s << 16) | (cpu & 0xffff);
309+
}
310+
270311
int cpu_map__build_socket_map(struct cpu_map *cpus, struct cpu_map **sockp)
271312
{
272313
return cpu_map__build_map(cpus, sockp, cpu_map__get_socket);
273314
}
315+
316+
int cpu_map__build_core_map(struct cpu_map *cpus, struct cpu_map **corep)
317+
{
318+
return cpu_map__build_map(cpus, corep, cpu_map__get_core);
319+
}

tools/perf/util/cpumap.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@ void cpu_map__delete(struct cpu_map *map);
1515
struct cpu_map *cpu_map__read(FILE *file);
1616
size_t cpu_map__fprintf(struct cpu_map *map, FILE *fp);
1717
int cpu_map__get_socket(struct cpu_map *map, int idx);
18+
int cpu_map__get_core(struct cpu_map *map, int idx);
1819
int cpu_map__build_socket_map(struct cpu_map *cpus, struct cpu_map **sockp);
20+
int cpu_map__build_core_map(struct cpu_map *cpus, struct cpu_map **corep);
1921

2022
static inline int cpu_map__socket(struct cpu_map *sock, int s)
2123
{
@@ -24,6 +26,16 @@ static inline int cpu_map__socket(struct cpu_map *sock, int s)
2426
return sock->map[s];
2527
}
2628

29+
/*
 * Extract the socket part of a combined id as produced by
 * cpu_map__get_core(): everything above the low 16 bits.
 */
static inline int cpu_map__id_to_socket(int id)
{
	const int socket = id >> 16;

	return socket;
}
33+
34+
/*
 * Extract the low 16 bits of a combined id. For ids produced by
 * cpu_map__get_core() this is the core_id part, despite the "cpu" in the
 * function name.
 */
static inline int cpu_map__id_to_cpu(int id)
{
	const int low16 = id & 0xffff;

	return low16;
}
38+
2739
static inline int cpu_map__nr(const struct cpu_map *map)
2840
{
2941
return map ? map->nr : 1;

0 commit comments

Comments
 (0)