Skip to content

Commit fb5eb47

Browse files
authored
Merge pull request #2398 from xianyi/develop
Update from develop in preparation of the 0.3.8 release
2 parents 5f36f18 + fa93d63 commit fb5eb47

File tree

1,004 files changed

+60793
-14466
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,004 files changed

+60793
-14466
lines changed

.travis.yml

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ matrix:
1717
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
1818
script:
1919
- set -e
20-
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
20+
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
2121
- make -C test $COMMON_FLAGS $BTYPE
2222
- make -C ctest $COMMON_FLAGS $BTYPE
2323
- make -C utest $COMMON_FLAGS $BTYPE
@@ -160,18 +160,25 @@ matrix:
160160
os: osx
161161
osx_image: xcode10.1
162162
before_script:
163-
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
163+
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
164164
- brew update
165-
- brew install gcc # for gfortran
165+
- brew install gcc@8 # for gfortran
166166
script:
167167
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
168168
env:
169-
- BTYPE="BINARY=64 INTERFACE64=1"
169+
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8"
170170

171171
- <<: *test-macos
172-
osx_image: xcode8.3
172+
osx_image: xcode10.0
173173
env:
174-
- BTYPE="BINARY=32"
174+
- BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1"
175+
176+
- <<: *test-macos
177+
osx_image: xcode10.1
178+
env:
179+
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk"
180+
- CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
181+
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
175182

176183
# whitelist
177184
branches:

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
66
project(OpenBLAS C ASM)
77
set(OpenBLAS_MAJOR_VERSION 0)
88
set(OpenBLAS_MINOR_VERSION 3)
9-
set(OpenBLAS_PATCH_VERSION 7)
9+
set(OpenBLAS_PATCH_VERSION 8)
1010
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
1111

1212
# Adhere to GNU filesystem layout conventions

CONTRIBUTORS.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,3 +171,12 @@ In chronological order:
171171
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
172172
* [2019-03-14] power9 dgemm/dtrmm kernel
173173
* [2019-04-29] power9 sgemm/strmm kernel
174+
175+
* Jiachen Wang <https://github.com/wjc404>
176+
* [2019-07-29] optimize AVX2 DGEMM
177+
* [2019-10-20] AVX512 DGEMM kernel (4x8)
178+
* [2019-11-06] optimize AVX512 SGEMM
179+
* [2019-11-12] AVX512 CGEMM & ZGEMM kernels
180+
* [2019-12-23] optimize AVX2 CGEMM and ZGEMM
181+
* [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels
182+
* [2020-01-07] optimize AVX2 SGEMM and STRMM

Changelog.txt

Lines changed: 89 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,100 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.8
4+
9-Feb-2020
5+
6+
common:
7+
* LAPACK has been updated to 3.9.0 (plus patches up to
8+
January 2nd, 2020)
9+
* CMAKE support has been improved in several areas including
10+
cross-compilation
11+
* a thread race condition in the GEMM3M kernels was resolved
12+
* the "generic" (plain C) gemm beta kernel used by many targets
13+
has been sped up
14+
* an optimized version of the LAPACK trtrs functions has been added
15+
* an incompatibility between the LAPACK tests and the OpenBLAS
16+
implementation of XERBLA was resolved, removing the numerous
17+
warnings about wrong error exits in the former
18+
* support for NetBSD has been added
19+
* support for compilation with g95 and non-GNU versions of ld
20+
has been improved
21+
* support for compilation with (upcoming) gcc 10 has been added
22+
23+
POWER:
24+
* worked around miscompilation of several POWER8 and POWER9
25+
kernels by older versions of gcc
26+
* added support for big-endian POWER8 and for compilation on AIX
27+
* corrected bugs in the big-endian support for PPC440 and PPC970
28+
* DYNAMIC_ARCH support is now available in CMAKE builds as well
29+
30+
ARMV8:
31+
* performance of DGEMM_BETA and SGEMM_NCOPY has been improved
32+
* compilation for 32bit works again
33+
* performance of the RPCC function has been improved
34+
* improved performance on small systems
35+
* DYNAMIC_ARCH support is now available in CMAKE builds as well
36+
* cross-compilation from OSX to IOS was simplified
37+
38+
x86_64:
39+
* a new AVX512 DGEMM kernel was added and the AVX512 SGEMM kernel
40+
was significantly improved
41+
* optimized AVX512 kernels for CGEMM and ZGEMM have been added
42+
* AVX2 kernels for STRMM, SGEMM, and CGEMM have been significantly
43+
sped up and optimized CGEMM3M and ZGEMM3M kernels have been added
44+
* added support for QEMU virtual cpus
45+
* a compilation problem with PGI and SUN compilers was fixed
46+
* Intel "Goldmont plus" is now autodetected
47+
* a potential crash on program exit on MS Windows has been fixed
48+
49+
x86:
50+
* an unwanted case sensitivity in the implementation of LSAME
51+
on older 32bit AMD cpus was fixed
52+
53+
zarch:
54+
* Z15 is now supported as Z14
55+
* DYNAMIC_ARCH is now available on ZARCH as well
56+
257
====================================================================
358
Version 0.3.7
459
11-Aug 2019
560

661
common:
7-
* having the gmake special variables TARGET_ARCH or TARGET_MACH
8-
defined no longer causes build failures in ctest or utest
9-
* defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer
10-
has the same effect as setting them to 1
11-
* a new test program was added to allow checking the library for
12-
thread safety
13-
* a new option USE_LOCKING was added to ensure thread safety when
14-
OpenBLAS itself is built without multithreading but will be
15-
called from multiple threads.
16-
* a build failure on Linux with glibc versions earlier than 2.5
17-
was fixed
18-
* a runtime error with CPU enumeration (and NO_AFFINITY not set)
19-
on glibc 2.6 was fixed
20-
* NO_AFFINITY was added to the CMAKE options (and defaults to being
21-
active on Linux, as in the gmake builds)
62+
* having the gmake special variables TARGET_ARCH or TARGET_MACH
63+
defined no longer causes build failures in ctest or utest
64+
* defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer
65+
has the same effect as setting them to 1
66+
* a new test program was added to allow checking the library for
67+
thread safety
68+
* a new option USE_LOCKING was added to ensure thread safety when
69+
OpenBLAS itself is built without multithreading but will be
70+
called from multiple threads.
71+
* a build failure on Linux with glibc versions earlier than 2.5
72+
was fixed
73+
* a runtime error with CPU enumeration (and NO_AFFINITY not set)
74+
on glibc 2.6 was fixed
75+
* NO_AFFINITY was added to the CMAKE options (and defaults to being
76+
active on Linux, as in the gmake builds)
2277

2378
x86_64:
24-
* the build-time logic for detection of AVX512 availability in
25-
the processor and compiler was fixed
26-
* gmake builds on OSX now set the internal name of the library to
27-
libopenblas.0.dylib (consistent with CMAKE)
28-
* the Haswell DGEMM kernel received a significant speedup through
29-
improved prefetch and load instructions
30-
* performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly
31-
increased by avoiding vpermpd instructions
32-
* the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled
33-
to fix remaining errors in DGEMM, DSYMM and DTRMM
34-
35-
## POWER:
36-
* added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970
37-
* added optimized kernels for POWER9 single and double precision complex BLAS3
38-
* added optimized kernels for POWER9 SGEMM and STRMM
39-
40-
## ARMV7:
41-
* fixed the softfp implementations of xAMAX and IxAMAX
42-
* removed the predefined -march= flags on both ARMV5 and ARMV6 as
43-
they were appropriate for only a subset of platforms
79+
* the build-time logic for detection of AVX512 availability in
80+
the processor and compiler was fixed
81+
* gmake builds on OSX now set the internal name of the library to
82+
libopenblas.0.dylib (consistent with CMAKE)
83+
* the Haswell DGEMM kernel received a significant speedup through
84+
improved prefetch and load instructions
85+
* performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly
86+
increased by avoiding vpermpd instructions
87+
* the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled
88+
to fix remaining errors in DGEMM, DSYMM and DTRMM
89+
90+
POWER:
91+
* added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970
92+
* added optimized kernels for POWER9 SGEMM and STRMM
93+
94+
ARMV7:
95+
* fixed the softfp implementations of xAMAX and IxAMAX
96+
* removed the predefined -march= flags on both ARMV5 and ARMV6 as
97+
they were appropriate for only a subset of platforms
4498

4599
====================================================================
46100
Version 0.3.6

Makefile

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -247,21 +247,21 @@ prof_lapack : lapack_prebuild
247247

248248
lapack_prebuild :
249249
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
250-
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
251-
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
250+
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
251+
-@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
252252
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
253-
-@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
253+
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
254254
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
255-
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
255+
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
256256
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
257257
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
258-
-@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
259-
-@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
258+
-@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
259+
-@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
260260
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
261-
-@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
262-
-@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
261+
-@echo "LAPACKLIB = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
262+
-@echo "TMGLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
263263
-@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
264-
-@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
264+
-@echo "LAPACKELIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
265265
-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
266266
-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
267267
-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -319,7 +319,7 @@ lapack-test :
319319
ifneq ($(CROSS), 1)
320320
( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
321321
./testsecond; ./testdsecnd; ./testieee; ./testversion )
322-
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
322+
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
323323
endif
324324

325325
lapack-runtest:

Makefile.arm64

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,10 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
3939
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
4040
endif
4141

42+
ifeq ($(GCCVERSIONGTEQ9), 1)
4243
ifeq ($(CORE), TSV110)
4344
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
4445
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
4546
endif
47+
endif
48+

Makefile.install

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ endif
5151
ifneq ($(OSNAME), AIX)
5252
ifndef NO_LAPACKE
5353
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
54+
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
5455
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
5556
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
5657
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@@ -100,6 +101,7 @@ else
100101
#install on AIX has different options syntax
101102
ifndef NO_LAPACKE
102103
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
104+
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
103105
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
104106
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
105107
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"

Makefile.rule

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44

55
# This library's version
6-
VERSION = 0.3.7
6+
VERSION = 0.3.8
77

88
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
99
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

Makefile.system

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,11 @@ ifndef TOPDIR
99
TOPDIR = .
1010
endif
1111

12-
# If ARCH is not set, we use the host system's architecture.
12+
# If ARCH is not set, we use the host system's architecture for getarch compile options.
1313
ifndef ARCH
14-
ARCH := $(shell uname -m)
14+
HOSTARCH := $(shell uname -m)
15+
else
16+
HOSTARCH = $(ARCH)
1517
endif
1618

1719
# Catch conflicting usage of ARCH in some BSD environments
@@ -23,6 +25,8 @@ else ifeq ($(ARCH), i386)
2325
override ARCH=x86
2426
else ifeq ($(ARCH), aarch64)
2527
override ARCH=arm64
28+
else ifeq ($(ARCH), zarch)
29+
override ARCH=zarch
2630
endif
2731

2832
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
@@ -142,9 +146,9 @@ endif
142146
endif
143147

144148

145-
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
146-
ifeq ($(ARCH), x86_64)
147-
ifneq ($(C_COMPILER), PGI)
149+
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
150+
ifeq ($(HOSTARCH), x86_64)
151+
ifeq ($(findstring pgcc,$(HOSTCC)),)
148152
GETARCH_FLAGS += -march=native
149153
endif
150154
endif
@@ -320,12 +324,14 @@ CCOMMON_OPT += -DMS_ABI
320324
endif
321325

322326
ifeq ($(C_COMPILER), GCC)
323-
#Test for supporting MS_ABI
327+
#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
324328
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
325329
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
330+
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
331+
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
326332
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
327333
ifeq ($(GCCVERSIONGT4), 1)
328-
# GCC Majar version > 4
334+
# GCC Major version > 4
329335
# It is compatible with MSVC ABI.
330336
CCOMMON_OPT += -DMS_ABI
331337
endif
@@ -544,15 +550,34 @@ endif
544550

545551
ifeq ($(ARCH), arm64)
546552
DYNAMIC_CORE = ARMV8
553+
DYNAMIC_CORE += CORTEXA53
547554
DYNAMIC_CORE += CORTEXA57
555+
DYNAMIC_CORE += CORTEXA72
556+
DYNAMIC_CORE += CORTEXA73
557+
DYNAMIC_CORE += FALKOR
548558
DYNAMIC_CORE += THUNDERX
549559
DYNAMIC_CORE += THUNDERX2T99
560+
DYNAMIC_CORE += TSV110
561+
endif
562+
563+
ifeq ($(ARCH), zarch)
564+
DYNAMIC_CORE = Z13
565+
DYNAMIC_CORE += Z14
550566
endif
551567

552568
ifeq ($(ARCH), power)
553569
DYNAMIC_CORE = POWER6
554570
DYNAMIC_CORE += POWER8
571+
ifneq ($(C_COMPILER), GCC)
572+
DYNAMIC_CORE += POWER9
573+
endif
574+
ifeq ($(C_COMPILER), GCC)
575+
ifeq ($(GCCVERSIONGT5), 1)
555576
DYNAMIC_CORE += POWER9
577+
else
578+
# Notify the user that POWER9 is left out of DYNAMIC_CORE for this compiler.
# Note: no comma after "info" - GNU make only recognises a function name when
# it is followed by whitespace; "$(info, ...)" silently expands to nothing.
$(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
579+
endif
580+
endif
556581
endif
557582

558583
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
@@ -697,7 +722,7 @@ endif
697722

698723
ifeq ($(C_COMPILER), PGI)
699724
ifdef BINARY64
700-
CCOMMON_OPT += -tp p7-64
725+
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
701726
else
702727
CCOMMON_OPT += -tp p7
703728
endif
@@ -757,6 +782,9 @@ else
757782
FCOMMON_OPT += -m32
758783
endif
759784
endif
785+
ifneq ($(NO_LAPACKE), 1)
786+
FCOMMON_OPT += -fno-second-underscore
787+
endif
760788
endif
761789
endif
762790

0 commit comments

Comments
 (0)