@@ -13,9 +13,9 @@ modification, are permitted provided that the following conditions are
13
13
notice, this list of conditions and the following disclaimer in
14
14
the documentation and/or other materials provided with the
15
15
distribution.
16
- 3. Neither the name of the OpenBLAS project nor the names of
17
- its contributors may be used to endorse or promote products
18
- derived from this software without specific prior written
16
+ 3. Neither the name of the OpenBLAS project nor the names of
17
+ its contributors may be used to endorse or promote products
18
+ derived from this software without specific prior written
19
19
permission.
20
20
21
21
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@@ -139,6 +139,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
139
139
#define FIXED_PAGESIZE 4096
140
140
#endif
141
141
142
+ #ifndef BUFFERS_PER_THREAD
143
+ #ifdef USE_OPENMP
144
+ #define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
145
+ #else
146
+ #define BUFFERS_PER_THREAD NUM_BUFFERS
147
+ #endif
148
+ #endif
149
+
142
150
#define BITMASK (a , b , c ) ((((a) >> (b)) & (c)))
143
151
144
152
#if defined(_MSC_VER ) && !defined(__clang__ )
@@ -213,7 +221,7 @@ int i,n;
213
221
ret = sched_getaffinity (0 ,size ,cpusetp );
214
222
if (ret != 0 ) return nums ;
215
223
ret = CPU_COUNT_S (size ,cpusetp );
216
- if (ret > 0 && ret < nums ) nums = ret ;
224
+ if (ret > 0 && ret < nums ) nums = ret ;
217
225
CPU_FREE (cpusetp );
218
226
return nums ;
219
227
#endif
@@ -415,8 +423,15 @@ struct release_t {
415
423
416
424
int hugetlb_allocated = 0 ;
417
425
418
- static struct release_t release_info [NUM_BUFFERS ];
419
- static int release_pos = 0 ;
426
+ #if defined(OS_WINDOWS )
427
+ #define THREAD_LOCAL __declspec(thread)
428
+ #define UNLIKELY_TO_BE_ZERO (x ) (x)
429
+ #else
430
+ #define THREAD_LOCAL __thread
431
+ #define UNLIKELY_TO_BE_ZERO (x ) (__builtin_expect(x, 0))
432
+ #endif
433
+ static struct release_t THREAD_LOCAL release_info [BUFFERS_PER_THREAD ];
434
+ static int THREAD_LOCAL release_pos = 0 ;
420
435
421
436
#if defined(OS_LINUX ) && !defined(NO_WARMUP )
422
437
static int hot_alloc = 0 ;
@@ -459,15 +474,9 @@ static void *alloc_mmap(void *address){
459
474
}
460
475
461
476
if (map_address != (void * )-1 ) {
462
- #if defined(SMP ) && !defined(USE_OPENMP )
463
- LOCK_COMMAND (& alloc_lock );
464
- #endif
465
477
release_info [release_pos ].address = map_address ;
466
478
release_info [release_pos ].func = alloc_mmap_free ;
467
479
release_pos ++ ;
468
- #if defined(SMP ) && !defined(USE_OPENMP )
469
- UNLOCK_COMMAND (& alloc_lock );
470
- #endif
471
480
}
472
481
473
482
#ifdef OS_LINUX
@@ -611,15 +620,9 @@ static void *alloc_mmap(void *address){
611
620
#endif
612
621
613
622
if (map_address != (void * )-1 ) {
614
- #if defined(SMP ) && !defined(USE_OPENMP )
615
- LOCK_COMMAND (& alloc_lock );
616
- #endif
617
623
release_info [release_pos ].address = map_address ;
618
624
release_info [release_pos ].func = alloc_mmap_free ;
619
625
release_pos ++ ;
620
- #if defined(SMP ) && !defined(USE_OPENMP )
621
- UNLOCK_COMMAND (& alloc_lock );
622
- #endif
623
626
}
624
627
625
628
return map_address ;
@@ -872,7 +875,7 @@ static void *alloc_hugetlb(void *address){
872
875
873
876
tp .PrivilegeCount = 1 ;
874
877
tp .Privileges [0 ].Attributes = SE_PRIVILEGE_ENABLED ;
875
-
878
+
876
879
if (LookupPrivilegeValue (NULL , SE_LOCK_MEMORY_NAME , & tp .Privileges [0 ].Luid ) != TRUE) {
877
880
CloseHandle (hToken );
878
881
return (void * )-1 ;
@@ -961,20 +964,17 @@ static BLASULONG base_address = 0UL;
961
964
static BLASULONG base_address = BASE_ADDRESS ;
962
965
#endif
963
966
964
- static volatile struct {
965
- BLASULONG lock ;
967
+ struct memory_t {
966
968
void * addr ;
967
- #if defined(WHEREAMI ) && !defined(USE_OPENMP )
968
- int pos ;
969
- #endif
970
969
int used ;
971
970
#ifndef __64BIT__
972
971
char dummy [48 ];
973
972
#else
974
973
char dummy [40 ];
975
974
#endif
975
+ };
976
976
977
- } memory [NUM_BUFFERS ];
977
+ static struct memory_t THREAD_LOCAL memory [BUFFERS_PER_THREAD ];
978
978
979
979
static int memory_initialized = 0 ;
980
980
@@ -987,9 +987,6 @@ static int memory_initialized = 0;
987
987
void * blas_memory_alloc (int procpos ){
988
988
989
989
int position ;
990
- #if defined(WHEREAMI ) && !defined(USE_OPENMP )
991
- int mypos ;
992
- #endif
993
990
994
991
void * map_address ;
995
992
@@ -1020,102 +1017,48 @@ void *blas_memory_alloc(int procpos){
1020
1017
};
1021
1018
void * (* * func )(void * address );
1022
1019
1023
- #if defined(USE_OPENMP )
1024
- if (!memory_initialized ) {
1025
- #endif
1026
-
1027
- LOCK_COMMAND (& alloc_lock );
1020
+ if (UNLIKELY_TO_BE_ZERO (memory_initialized )) {
1028
1021
1029
- if (!memory_initialized ) {
1022
+ /* Only allow a single thread to initialize memory system */
1023
+ LOCK_COMMAND (& alloc_lock );
1030
1024
1031
- #if defined(WHEREAMI ) && !defined(USE_OPENMP )
1032
- for (position = 0 ; position < NUM_BUFFERS ; position ++ ){
1033
- memory [position ].addr = (void * )0 ;
1034
- memory [position ].pos = -1 ;
1035
- memory [position ].used = 0 ;
1036
- memory [position ].lock = 0 ;
1037
- }
1038
- #endif
1025
+ if (!memory_initialized ) {
1039
1026
1040
1027
#ifdef DYNAMIC_ARCH
1041
- gotoblas_dynamic_init ();
1028
+ gotoblas_dynamic_init ();
1042
1029
#endif
1043
1030
1044
1031
#if defined(SMP ) && defined(OS_LINUX ) && !defined(NO_AFFINITY )
1045
- gotoblas_affinity_init ();
1032
+ gotoblas_affinity_init ();
1046
1033
#endif
1047
1034
1048
1035
#ifdef SMP
1049
- if (!blas_num_threads ) blas_cpu_number = blas_get_cpu_number ();
1036
+ if (!blas_num_threads ) blas_cpu_number = blas_get_cpu_number ();
1050
1037
#endif
1051
1038
1052
1039
#if defined(ARCH_X86 ) || defined(ARCH_X86_64 ) || defined(ARCH_IA64 ) || defined(ARCH_MIPS64 ) || defined(ARCH_ARM64 )
1053
1040
#ifndef DYNAMIC_ARCH
1054
- blas_set_parameter ();
1041
+ blas_set_parameter ();
1055
1042
#endif
1056
1043
#endif
1057
1044
1058
- memory_initialized = 1 ;
1045
+ memory_initialized = 1 ;
1059
1046
1047
+ }
1048
+ UNLOCK_COMMAND (& alloc_lock );
1060
1049
}
1061
- UNLOCK_COMMAND (& alloc_lock );
1062
- #if defined(USE_OPENMP )
1063
- }
1064
- #endif
1065
1050
1066
1051
#ifdef DEBUG
1067
1052
printf ("Alloc Start ...\n" );
1068
- #endif
1069
-
1070
- #if defined(WHEREAMI ) && !defined(USE_OPENMP )
1071
-
1072
- mypos = WhereAmI ();
1073
-
1074
- position = mypos ;
1075
- while (position >= NUM_BUFFERS ) position >>= 1 ;
1076
-
1077
- do {
1078
- if (!memory [position ].used && (memory [position ].pos == mypos )) {
1079
- #if defined(SMP ) && !defined(USE_OPENMP )
1080
- LOCK_COMMAND (& alloc_lock );
1081
- #else
1082
- blas_lock (& memory [position ].lock );
1083
- #endif
1084
- if (!memory [position ].used ) goto allocation ;
1085
- #if defined(SMP ) && !defined(USE_OPENMP )
1086
- UNLOCK_COMMAND (& alloc_lock );
1087
- #else
1088
- blas_unlock (& memory [position ].lock );
1089
- #endif
1090
- }
1091
-
1092
- position ++ ;
1093
-
1094
- } while (position < NUM_BUFFERS );
1095
-
1096
-
1097
1053
#endif
1098
1054
1099
1055
position = 0 ;
1100
1056
1101
1057
do {
1102
- #if defined(SMP ) && !defined(USE_OPENMP )
1103
- LOCK_COMMAND (& alloc_lock );
1104
- #else
1105
- if (!memory [position ].used ) {
1106
- blas_lock (& memory [position ].lock );
1107
- #endif
1108
1058
if (!memory [position ].used ) goto allocation ;
1109
- #if defined(SMP ) && !defined(USE_OPENMP )
1110
- UNLOCK_COMMAND (& alloc_lock );
1111
- #else
1112
- blas_unlock (& memory [position ].lock );
1113
- }
1114
- #endif
1115
-
1116
1059
position ++ ;
1117
1060
1118
- } while (position < NUM_BUFFERS );
1061
+ } while (position < BUFFERS_PER_THREAD );
1119
1062
1120
1063
goto error ;
1121
1064
@@ -1126,11 +1069,6 @@ void *blas_memory_alloc(int procpos){
1126
1069
#endif
1127
1070
1128
1071
memory [position ].used = 1 ;
1129
- #if defined(SMP ) && !defined(USE_OPENMP )
1130
- UNLOCK_COMMAND (& alloc_lock );
1131
- #else
1132
- blas_unlock (& memory [position ].lock );
1133
- #endif
1134
1072
1135
1073
if (!memory [position ].addr ) {
1136
1074
do {
@@ -1148,14 +1086,14 @@ void *blas_memory_alloc(int procpos){
1148
1086
1149
1087
#ifdef ALLOC_DEVICEDRIVER
1150
1088
if ((* func == alloc_devicedirver ) && (map_address == (void * )-1 )) {
1151
- fprintf (stderr , "OpenBLAS Warning ... Physically contigous allocation was failed.\n" );
1089
+ fprintf (stderr , "OpenBLAS Warning ... Physically contiguous allocation failed.\n" );
1152
1090
}
1153
1091
#endif
1154
1092
1155
1093
#ifdef ALLOC_HUGETLBFILE
1156
1094
if ((* func == alloc_hugetlbfile ) && (map_address == (void * )-1 )) {
1157
1095
#ifndef OS_WINDOWS
1158
- fprintf (stderr , "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n" );
1096
+ fprintf (stderr , "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n" );
1159
1097
#endif
1160
1098
}
1161
1099
#endif
@@ -1176,44 +1114,13 @@ void *blas_memory_alloc(int procpos){
1176
1114
1177
1115
} while ((BLASLONG )map_address == -1 );
1178
1116
1179
- #if defined(SMP ) && !defined(USE_OPENMP )
1180
- LOCK_COMMAND (& alloc_lock );
1181
- #endif
1182
1117
memory [position ].addr = map_address ;
1183
- #if defined(SMP ) && !defined(USE_OPENMP )
1184
- UNLOCK_COMMAND (& alloc_lock );
1185
- #endif
1186
1118
1187
1119
#ifdef DEBUG
1188
1120
printf (" Mapping Succeeded. %p(%d)\n" , (void * )memory [position ].addr , position );
1189
1121
#endif
1190
1122
}
1191
1123
1192
- #if defined(WHEREAMI ) && !defined(USE_OPENMP )
1193
-
1194
- if (memory [position ].pos == -1 ) memory [position ].pos = mypos ;
1195
-
1196
- #endif
1197
-
1198
- #ifdef DYNAMIC_ARCH
1199
-
1200
- if (memory_initialized == 1 ) {
1201
-
1202
- LOCK_COMMAND (& alloc_lock );
1203
-
1204
- if (memory_initialized == 1 ) {
1205
-
1206
- if (!gotoblas ) gotoblas_dynamic_init ();
1207
-
1208
- memory_initialized = 2 ;
1209
- }
1210
-
1211
- UNLOCK_COMMAND (& alloc_lock );
1212
-
1213
- }
1214
- #endif
1215
-
1216
-
1217
1124
#ifdef DEBUG
1218
1125
printf ("Mapped : %p %3d\n\n" ,
1219
1126
(void * )memory [position ].addr , position );
@@ -1222,7 +1129,7 @@ void *blas_memory_alloc(int procpos){
1222
1129
return (void * )memory [position ].addr ;
1223
1130
1224
1131
error :
1225
- printf ("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n" );
1132
+ printf ("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n" );
1226
1133
1227
1134
return NULL ;
1228
1135
}
@@ -1236,10 +1143,7 @@ void blas_memory_free(void *free_area){
1236
1143
#endif
1237
1144
1238
1145
position = 0 ;
1239
- #if defined(SMP ) && !defined(USE_OPENMP )
1240
- LOCK_COMMAND (& alloc_lock );
1241
- #endif
1242
- while ((position < NUM_BUFFERS ) && (memory [position ].addr != free_area ))
1146
+ while ((position < BUFFERS_PER_THREAD ) && (memory [position ].addr != free_area ))
1243
1147
position ++ ;
1244
1148
1245
1149
if (memory [position ].addr != free_area ) goto error ;
@@ -1248,13 +1152,7 @@ void blas_memory_free(void *free_area){
1248
1152
printf (" Position : %d\n" , position );
1249
1153
#endif
1250
1154
1251
- // arm: ensure all writes are finished before other thread takes this memory
1252
- WMB ;
1253
-
1254
1155
memory [position ].used = 0 ;
1255
- #if defined(SMP ) && !defined(USE_OPENMP )
1256
- UNLOCK_COMMAND (& alloc_lock );
1257
- #endif
1258
1156
1259
1157
#ifdef DEBUG
1260
1158
printf ("Unmap Succeeded.\n\n" );
@@ -1266,11 +1164,8 @@ void blas_memory_free(void *free_area){
1266
1164
printf ("BLAS : Bad memory unallocation! : %4d %p\n" , position , free_area );
1267
1165
1268
1166
#ifdef DEBUG
1269
- for (position = 0 ; position < NUM_BUFFERS ; position ++ )
1167
+ for (position = 0 ; position < BUFFERS_PER_THREAD ; position ++ )
1270
1168
printf ("%4ld %p : %d\n" , position , memory [position ].addr , memory [position ].used );
1271
- #endif
1272
- #if defined(SMP ) && !defined(USE_OPENMP )
1273
- UNLOCK_COMMAND (& alloc_lock );
1274
1169
#endif
1275
1170
return ;
1276
1171
}
@@ -1293,8 +1188,6 @@ void blas_shutdown(void){
1293
1188
BLASFUNC (blas_thread_shutdown )();
1294
1189
#endif
1295
1190
1296
- LOCK_COMMAND (& alloc_lock );
1297
-
1298
1191
for (pos = 0 ; pos < release_pos ; pos ++ ) {
1299
1192
release_info [pos ].func (& release_info [pos ]);
1300
1193
}
@@ -1305,17 +1198,11 @@ void blas_shutdown(void){
1305
1198
base_address = BASE_ADDRESS ;
1306
1199
#endif
1307
1200
1308
- for (pos = 0 ; pos < NUM_BUFFERS ; pos ++ ){
1201
+ for (pos = 0 ; pos < BUFFERS_PER_THREAD ; pos ++ ){
1309
1202
memory [pos ].addr = (void * )0 ;
1310
1203
memory [pos ].used = 0 ;
1311
- #if defined(WHEREAMI ) && !defined(USE_OPENMP )
1312
- memory [pos ].pos = -1 ;
1313
- #endif
1314
- memory [pos ].lock = 0 ;
1315
1204
}
1316
1205
1317
- UNLOCK_COMMAND (& alloc_lock );
1318
-
1319
1206
return ;
1320
1207
}
1321
1208
0 commit comments