18
18
#define WARMUP 5
19
19
#define ITERATIONS 10
20
20
21
- #define L1_SIZE 32 *100
22
- #define L2_SIZE 32 *2000
23
- #define L3_SIZE 32 *20000
24
- #define MEM_SIZE 32 *2000000
21
+ #define L1_SIZE 32 *128
22
+ #define L2_SIZE 32 *2048
23
+ #define L3_SIZE 32 *20480
24
+ #define MEM_SIZE 32 *2048000
25
25
26
26
struct quantize_perf_params {
27
27
std::vector<std::string> include_types;
@@ -36,7 +36,7 @@ struct quantize_perf_params {
36
36
37
37
#if defined(__x86_64__) || defined(__i386__)
38
38
39
- #include < immintrin .h>
39
+ #include < x86intrin .h>
40
40
inline int64_t cpu_cycles () {
41
41
// Rough way to detect new-ish CPUs
42
42
#ifdef __POPCNT__
@@ -71,29 +71,25 @@ void * align_with_offset(void * ptr, int offset) {
71
71
}
72
72
73
73
void benchmark_function (size_t size, size_t q_size, std::function<size_t (void )> function) {
74
-
75
- size_t bytes_out = 0 ;
76
-
77
74
int64_t min_time_us = INT64_MAX;
78
75
int64_t total_time_us = 0 ;
79
76
int64_t min_time_cycles = INT64_MAX;
80
77
int64_t total_time_cycles = 0 ;
81
78
82
79
for (int i = 0 ; i < WARMUP; i++) {
83
- bytes_out |= function ();
80
+ function ();
84
81
}
85
82
86
83
87
84
for (int i = 0 ; i < ITERATIONS; i++) {
88
85
const int64_t start_time = ggml_time_us ();
89
86
const int64_t start_cycles = cpu_cycles ();
90
87
91
- bytes_out |= function ();
88
+ function ();
92
89
93
90
const int64_t end_cycles = cpu_cycles ();
94
91
const int64_t end_time = ggml_time_us ();
95
92
96
- // printf(" aostne %d\n", end_cycles - start_cycles);
97
93
total_time_cycles += end_cycles - start_cycles;
98
94
min_time_cycles = std::min (min_time_cycles, end_cycles - start_cycles);
99
95
total_time_us += end_time - start_time;
0 commit comments