Skip to content

Commit e8a68ef

Browse files
authored
Merge pull request #1702 from xianyi/develop
Merge develop for 0.3.2
2 parents c6aec89 + 64826a0 commit e8a68ef

File tree

14 files changed

+268
-44
lines changed

14 files changed

+268
-44
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
66
project(OpenBLAS C ASM)
77
set(OpenBLAS_MAJOR_VERSION 0)
88
set(OpenBLAS_MINOR_VERSION 3)
9-
set(OpenBLAS_PATCH_VERSION 1)
9+
set(OpenBLAS_PATCH_VERSION 2)
1010
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
1111

1212
# Adhere to GNU filesystem layout conventions

Makefile

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1)
2121
RELA = re_lapack
2222
endif
2323

24+
ifeq ($(NO_FORTRAN), 1)
25+
define NOFORTRAN
26+
1
27+
endef
28+
define NO_LAPACK
29+
1
30+
endef
31+
export NOFORTRAN
32+
export NO_LAPACK
33+
endif
34+
2435
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
2536

2637
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
@@ -47,7 +58,7 @@ endif
4758
endif
4859

4960
@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
50-
ifndef NOFORTRAN
61+
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
5162
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
5263
endif
5364
ifneq ($(OSNAME), AIX)
@@ -108,7 +119,7 @@ endif
108119
endif
109120

110121
tests :
111-
ifndef NOFORTRAN
122+
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
112123
touch $(LIBNAME)
113124
ifndef NO_FBLAS
114125
$(MAKE) -C test all
@@ -210,7 +221,7 @@ netlib :
210221

211222
else
212223
netlib : lapack_prebuild
213-
ifndef NOFORTRAN
224+
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
214225
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
215226
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
216227
endif
@@ -231,7 +242,7 @@ prof_lapack : lapack_prebuild
231242
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
232243

233244
lapack_prebuild :
234-
ifndef NOFORTRAN
245+
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
235246
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
236247
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
237248
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -274,21 +285,21 @@ endif
274285
endif
275286

276287
large.tgz :
277-
ifndef NOFORTRAN
288+
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
278289
if [ ! -a $< ]; then
279290
-wget http://www.netlib.org/lapack/timing/large.tgz;
280291
fi
281292
endif
282293

283294
timing.tgz :
284-
ifndef NOFORTRAN
295+
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
285296
if [ ! -a $< ]; then
286297
-wget http://www.netlib.org/lapack/timing/timing.tgz;
287298
fi
288299
endif
289300

290301
lapack-timing : large.tgz timing.tgz
291-
ifndef NOFORTRAN
302+
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
292303
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
293304
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
294305
$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING

Makefile.rule

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44

55
# This library's version
6-
VERSION = 0.3.1
6+
VERSION = 0.3.2
77

88
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
99
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

cmake/prebuild.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ if (NOT NOFORTRAN)
8585
endif ()
8686

8787
# Cannot run getarch on target if we are cross-compiling
88-
if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
88+
if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
8989
# Write to config as getarch would
9090

9191
# TODO: Set up defines that getarch sets up based on every other target

cmake/system_check.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ endif()
6868

6969
if (X86_64 OR X86)
7070
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
71-
execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512)
71+
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
7272
if (NO_AVX512 EQUAL 1)
7373
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
7474
endif()

cpuid_power.c

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,52 @@ int detect(void){
142142

143143
return CPUTYPE_PPC970;
144144
#endif
145+
146+
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)
147+
int id;
148+
id = __asm __volatile("mfpvr %0" : "=r"(id));
149+
switch ( id >> 16 ) {
150+
case 0x4e: // POWER9
151+
return return CPUTYPE_POWER8;
152+
break;
153+
case 0x4d:
154+
case 0x4b: // POWER8/8E
155+
return CPUTYPE_POWER8;
156+
break;
157+
case 0x4a:
158+
case 0x3f: // POWER7/7E
159+
return CPUTYPE_POWER6;
160+
break;
161+
case 0x3e:
162+
return CPUTYPE_POWER6;
163+
break;
164+
case 0x3a:
165+
return CPUTYPE_POWER5;
166+
break;
167+
case 0x35:
168+
case 0x38: // POWER4 /4+
169+
return CPUTYPE_POWER4;
170+
break;
171+
case 0x40:
172+
case 0x41: // POWER3 /3+
173+
return CPUTYPE_POWER3;
174+
break;
175+
case 0x39:
176+
case 0x3c:
177+
case 0x44:
178+
case 0x45:
179+
return CPUTYPE_PPC970;
180+
break;
181+
case 0x70:
182+
return CPUTYPE_CELL;
183+
break;
184+
case 0x8003:
185+
return CPUTYPE_PPCG4;
186+
break;
187+
default:
188+
return CPUTYPE_UNKNOWN;
189+
}
190+
#endif
145191
}
146192

147193
void get_architecture(void){

cpuid_x86.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1452,6 +1452,8 @@ int get_cpuname(void){
14521452
switch (model) {
14531453
case 1:
14541454
// AMD Ryzen
1455+
case 8:
1456+
// AMD Ryzen2
14551457
if(support_avx())
14561458
#ifndef NO_AVX2
14571459
return CPUTYPE_ZEN;

driver/level3/level3_thread.c

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
344344
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
345345
for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {
346346

347+
/* Make sure no one is using the workspace */
348+
START_RPCC();
349+
for (i = 0; i < args -> nthreads; i++)
350+
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
351+
STOP_RPCC(waiting1);
352+
347353
#if defined(FUSED_GEMM) && !defined(TIMING)
348354

349355
/* Fused operation to copy region of B into workspace and apply kernel */
@@ -381,15 +387,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
381387
}
382388
#endif
383389

384-
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) {
385-
/* Make sure if no one is using workspace */
386-
START_RPCC();
387-
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
388-
STOP_RPCC(waiting1);
389-
/* Set flag so other threads can access local region of B */
390+
/* Set flag so other threads can access local region of B */
391+
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
390392
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
391-
WMB;
392-
}
393+
WMB;
393394
}
394395

395396
/* Get regions of B from other threads and apply kernel */
@@ -425,13 +426,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
425426

426427
/* Clear synchronization flag if this thread is done with other region of B */
427428
if (m_to - m_from == min_i) {
428-
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
429+
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
429430
WMB;
430431
}
431432
}
432433
} while (current != mypos);
433434

434-
/* Iterate through steps of m
435+
/* Iterate through steps of m
435436
* Note: First step has already been finished */
436437
for(is = m_from + min_i; is < m_to; is += min_i){
437438
min_i = m_to - is;
@@ -461,14 +462,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
461462
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
462463
c, ldc, is, js);
463464
STOP_RPCC(kernel);
464-
465+
465466
#ifdef TIMING
466467
ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
467468
#endif
468-
469+
469470
/* Clear synchronization flag if this thread is done with region of B */
470471
if (is + min_i >= m_to) {
471-
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
472+
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
472473
WMB;
473474
}
474475
}

driver/others/dynamic.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -510,7 +510,7 @@ static gotoblas_t *get_coretype(void){
510510
#ifndef NO_AVX2
511511
return &gotoblas_HASWELL;
512512
#else
513-
return &gotblas_SANDYBRIDGE;
513+
return &gotoblas_SANDYBRIDGE;
514514
#endif
515515
else
516516
return &gotoblas_NEHALEM;
@@ -607,7 +607,7 @@ static gotoblas_t *get_coretype(void){
607607
}
608608
}
609609
} else if (exfamily == 8) {
610-
if (model == 1) {
610+
if (model == 1 || model == 8) {
611611
if(support_avx())
612612
return &gotoblas_ZEN;
613613
else{

driver/others/memory.c

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
140140
#endif
141141

142142
#ifndef BUFFERS_PER_THREAD
143-
#ifdef USE_OPENMP
143+
#ifdef USE_OPENMP_UNUSED
144144
#define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
145145
#else
146146
#define BUFFERS_PER_THREAD NUM_BUFFERS
@@ -363,7 +363,7 @@ int blas_get_cpu_number(void){
363363
#endif
364364

365365
// blas_goto_num = 0;
366-
#ifndef USE_OPENMP
366+
#ifndef USE_OPENMP_UNUSED
367367
blas_goto_num=openblas_num_threads_env();
368368
if (blas_goto_num < 0) blas_goto_num = 0;
369369

@@ -494,10 +494,10 @@ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
494494
#endif
495495

496496
/* Holds pointers to allocated memory */
497-
#if defined(SMP) && !defined(USE_OPENMP)
497+
#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
498498
/* This is the number of threads that can be spawned by the server, which is the
499499
server plus the number of threads in the thread pool */
500-
# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +1
500+
# define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER * 2
501501
static int next_memory_table_pos = 0;
502502
# if defined(HAS_COMPILER_TLS)
503503
/* Use compiler generated thread-local-storage */
@@ -532,7 +532,7 @@ static BLASULONG alloc_lock = 0UL;
532532

533533
/* Returns a pointer to the start of the per-thread memory allocation data */
534534
static __inline struct alloc_t ** get_memory_table() {
535-
#if defined(SMP) && !defined(USE_OPENMP)
535+
#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
536536
# if !defined(HAS_COMPILER_TLS)
537537
# if defined(OS_WINDOWS)
538538
int local_memory_table_pos = (int)::TlsGetValue(local_storage_key);
@@ -1057,7 +1057,7 @@ static volatile int memory_initialized = 0;
10571057
/* 2 : Thread */
10581058

10591059
static void blas_memory_init(){
1060-
#if defined(SMP) && !defined(USE_OPENMP)
1060+
#if defined(SMP) && !defined(USE_OPENMP_UNUSED)
10611061
next_memory_table_pos = 0;
10621062
# if !defined(HAS_COMPILER_TLS)
10631063
# if defined(OS_WINDOWS)
@@ -1279,7 +1279,7 @@ void blas_shutdown(void){
12791279
struct alloc_t *alloc_info = local_memory_table[thread][pos];
12801280
if (alloc_info) {
12811281
alloc_info->release_func(alloc_info);
1282-
alloc_info = (void *)0;
1282+
local_memory_table[thread][pos] = (void *)0;
12831283
}
12841284
}
12851285
}

driver/others/openblas_get_config.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3535

3636
#include <string.h>
3737

38+
#if defined(_WIN32) && defined(_MSC_VER)
39+
#if _MSC_VER < 1900
40+
#define snprintf _snprintf
41+
#endif
42+
#endif
43+
3844
static char* openblas_config_str=""
3945
#ifdef USE64BITINT
4046
"USE64BITINT "

kernel/mips64/KERNEL

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
CAXPYKERNEL = ../mips/zaxpy.c
2+
ZAXPYKERNEL = ../mips/zaxpy.c
3+
SROTKERNEL = ../mips/rot.c
4+
DROTKERNEL = ../mips/rot.c
5+
CROTKERNEL = ../mips/zrot.c
6+
ZROTKERNEL = ../mips/zrot.c
7+
CSWAPKERNEL = ../mips/zswap.c
8+
ZSWAPKERNEL = ../mips/zswap.c
9+
110
ifndef SNRM2KERNEL
211
SNRM2KERNEL = snrm2.S
312
endif

0 commit comments

Comments
 (0)