From 6a99fcce94ac0ff76bcdc502197fb8e0cd1c97c6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Mar 2018 00:03:49 +0100 Subject: [PATCH 01/61] Use _Atomic instead of volatile for thread safety where C11 is supported Suggested by dodomorandi in #660 --- driver/level3/level3_gemm3m_thread.c | 7 ++++++- driver/level3/level3_syrk_threaded.c | 7 ++++++- driver/level3/level3_thread.c | 7 ++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index bfd991ffb6..f5e5bca1e1 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -91,7 +91,12 @@ #endif typedef struct { - volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +#if _STDC_VERSION__ >= 201112L + _Atomic +#else + volatile +#endif + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index 65002ae46c..d1c476f003 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -67,7 +67,12 @@ #endif typedef struct { - volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +#if _STDC_VERSION__ >= 201112L +_Atomic +#else + volatile +#endif + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index a1ed8bbb1f..47b20f7fa6 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -91,7 +91,12 @@ #endif typedef struct { - volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +#if _STDC_VERSION__ >= 201112L +_Atomic +#else + volatile +#endif + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; From 40160ff3c1d2427b312f9894795e89f21a48eca0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Mar 2018 00:15:44 +0100 Subject: [PATCH 02/61] Use _Atomic instead of volatile for thread safety where C11 is supported --- lapack/getrf/getrf_parallel.c | 32 +- lapack/getrf/potrf_parallel.c | 664 ++++++++++++++++++++++++++++++++++ 2 files changed, 690 insertions(+), 6 deletions(-) create mode 100644 lapack/getrf/potrf_parallel.c diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index db8c836e0a..27faea0cd9 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -99,7 +99,11 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE; FLOAT *sbb = sb; +#if _STDC_VERSION__ >= 201112L + _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; +#else volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; +#endif blasint *ipiv = (blasint *)args -> c; @@ -177,7 +181,12 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra /* Non blocking implementation */ typedef struct { - volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +#if _STDC_VERSION__ >= 201112L + _Atomic +#else + volatile +#endif + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); @@ -216,9 +225,11 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * FLOAT *sbb= sb; blasint *ipiv = (blasint *)args -> c; - +#if _STDC_VERSION__ >= 201112L + _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; +#else volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; - +#endif if (args -> a == NULL) { TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); @@ -378,7 +389,12 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, #ifdef _MSC_VER BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE]; #else - volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); +#if _STDC_VERSION__ >= 201112L + _Atomic +#else + volatile +#endif + BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); #endif #ifndef COMPLEX @@ -634,8 +650,12 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, BLASLONG range[MAX_CPU_NUMBER + 1]; BLASLONG width, nn, num_cpu; - - volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); +#if _STDC_VERSION__ >= 201112L + _Atomic +#else + volatile +#endif + BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); #ifndef COMPLEX #ifdef XDOUBLE diff --git a/lapack/getrf/potrf_parallel.c b/lapack/getrf/potrf_parallel.c new file mode 100644 index 0000000000..104022dd9d --- /dev/null +++ b/lapack/getrf/potrf_parallel.c @@ -0,0 +1,664 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_SIMPLE_THREADED_LEVEL3 + +//The array of job_t may overflow the stack. +//Instead, use malloc to alloc job_t. +#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD +#define USE_ALLOC_HEAP +#endif + + +static FLOAT dm1 = -1.; + +#ifndef KERNEL_FUNC +#ifndef LOWER +#define KERNEL_FUNC SYRK_KERNEL_U +#else +#define KERNEL_FUNC SYRK_KERNEL_L +#endif +#endif + +#ifndef LOWER +#ifndef COMPLEX +#define TRSM_KERNEL TRSM_KERNEL_LT +#else +#define TRSM_KERNEL TRSM_KERNEL_LC +#endif +#else +#ifndef COMPLEX +#define TRSM_KERNEL TRSM_KERNEL_RN +#else +#define TRSM_KERNEL TRSM_KERNEL_RR +#endif +#endif + +#ifndef CACHE_LINE_SIZE +#define CACHE_LINE_SIZE 8 +#endif + +#ifndef DIVIDE_RATE +#define DIVIDE_RATE 2 +#endif + +#ifndef SWITCH_RATIO +#define SWITCH_RATIO 2 +#endif + +#ifndef LOWER +#define TRANS +#endif + +#ifndef SYRK_LOCAL +#if !defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL SYRK_UN +#elif !defined(LOWER) && defined(TRANS) +#define SYRK_LOCAL SYRK_UT +#elif defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL SYRK_LN +#else +#define SYRK_LOCAL SYRK_LT +#endif +#endif + +typedef struct { +#if _STDC_VERSION__ >= 201112L + _Atomic +#else + volatile +#endif + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +} job_t; + + +#ifndef KERNEL_OPERATION +#ifndef COMPLEX +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) +#else +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) +#endif +#endif + +#ifndef ICOPY_OPERATION +#ifndef TRANS +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef OCOPY_OPERATION +#ifdef TRANS +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef S +#define S args -> a +#endif +#ifndef A +#define A args -> b +#endif +#ifndef C +#define C args -> c +#endif +#ifndef LDA +#define LDA args -> lda +#endif +#ifndef N +#define N args -> m +#endif +#ifndef K +#define K args -> k +#endif + +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + FLOAT *buffer[DIVIDE_RATE]; + + BLASLONG k, lda; + BLASLONG m_from, m_to; + + FLOAT *alpha; + FLOAT *a, *c; + job_t *job = (job_t *)args -> common; + BLASLONG xxx, bufferside; + + BLASLONG jjs, min_jj; + BLASLONG is, min_i, div_n; + + BLASLONG i, current; + + k = K; + + a = (FLOAT *)A; + c = (FLOAT *)C; + + lda = LDA; + + alpha = (FLOAT *)args -> alpha; + + m_from = range_n[mypos + 0]; + m_to = range_n[mypos + 1]; + +#if 0 + fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to); +#endif + + div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; + + buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + for (i = 1; i < DIVIDE_RATE; i++) { + buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; + } + +#ifndef LOWER + TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb); +#else + TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb); +#endif + + for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { + + for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ + + min_jj = MIN(m_to, xxx + div_n) - jjs; + +#ifndef LOWER + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; +#else + if (min_jj > GEMM_P) min_jj = GEMM_P; +#endif + +#ifndef LOWER + OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); + + TRSM_KERNEL (k, min_jj, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + sb, + buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, + a + jjs * lda * COMPSIZE, lda, 0); +#else + ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); + + TRSM_KERNEL (min_jj, k, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, + sb, + a + jjs * COMPSIZE, lda, 0); +#endif + } + +#ifndef LOWER + for (i = 0; i <= mypos; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; +#else + for (i = mypos; i < args -> nthreads; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; +#endif + + WMB; + } + + min_i = m_to - m_from; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; + } + +#ifndef LOWER + ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); +#else + OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); +#endif + + current = mypos; + +#ifndef LOWER + while (current < args -> nthreads) +#else + while (current >= 0) +#endif + { + div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + /* thread has to wait */ + if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, lda, m_from, xxx); + + if (m_from + min_i >= m_to) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; + } + } + +#ifndef LOWER + current ++; +#else + current --; +#endif + } + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; + } + +#ifndef LOWER + ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); +#else + OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa); +#endif + + current = mypos; + +#ifndef LOWER + while (current < args -> nthreads) +#else + while (current >= 0) +#endif + { + div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, lda, is, xxx); + + if (is + min_i >= m_to) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; + } + } +#ifndef LOWER + current ++; +#else + current --; +#endif + } + } + + for (i = 0; i < args -> nthreads; i++) { + if (i != mypos) { + for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { + while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; + } + } + } + + return 0; + } + +static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ + + blas_arg_t newarg; + +#ifndef USE_ALLOC_HEAP + job_t job[MAX_CPU_NUMBER]; +#else + job_t * job = NULL; +#endif + + blas_queue_t queue[MAX_CPU_NUMBER]; + + BLASLONG range[MAX_CPU_NUMBER + 100]; + + BLASLONG num_cpu; + + BLASLONG nthreads = args -> nthreads; + + BLASLONG width, i, j, k; + BLASLONG n, n_from, n_to; + int mode, mask; + double dnum; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; + mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; + mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; +#else + mode = BLAS_SINGLE | BLAS_REAL; + mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; + mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; + mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; + mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; +#endif +#endif + + newarg.m = args -> m; + newarg.k = args -> k; + newarg.a = args -> a; + newarg.b = args -> b; + newarg.c = args -> c; + newarg.lda = args -> lda; + newarg.alpha = args -> alpha; + +#ifdef USE_ALLOC_HEAP + job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); + if(job==NULL){ + fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); + exit(1); + } +#endif + + newarg.common = (void *)job; + + n_from = 0; + n_to = args -> m; + +#ifndef LOWER + + range[MAX_CPU_NUMBER] = n_to - n_from; + range[0] = 0; + num_cpu = 0; + i = 0; + n = n_to - n_from; + + dnum = (double)n * (double)n /(double)nthreads; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)i; + + width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); + + if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1)); + + if ((width > n - i) || (width < mask)) width = n - i; + + } else { + width = n - i; + } + + range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = inner_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = NULL; + + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + + for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu]; + +#else + + range[0] = 0; + num_cpu = 0; + i = 0; + n = n_to - n_from; + + dnum = (double)n * (double)n /(double)nthreads; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)i; + + width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); + + if ((width > n - i) || (width < mask)) width = n - i; + + } else { + width = n - i; + } + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = inner_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = NULL; + queue[num_cpu].range_n = range; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + newarg.nthreads = num_cpu; + + if (num_cpu) { + + for (j = 0; j < num_cpu; j++) { + for (i = 0; i < num_cpu; i++) { + for (k = 0; k < DIVIDE_RATE; k++) { + job[j].working[i][CACHE_LINE_SIZE * k] = 0; + } + } + } + + queue[0].sa = sa; + queue[0].sb = sb; + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + +#ifdef USE_ALLOC_HEAP + free(job); +#endif + + return 0; +} + +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, bk, i, blocking, lda; + BLASLONG info; + int mode; + blas_arg_t newarg; + FLOAT *a; + FLOAT alpha[2] = { -ONE, ZERO}; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + if (args -> nthreads == 1) { +#ifndef LOWER + info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); +#else + info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); +#endif + return info; + } + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) n = range_n[1] - range_n[0]; + + if (n <= GEMM_UNROLL_N * 2) { +#ifndef LOWER + info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0); +#else + info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0); +#endif + return info; + } + + newarg.lda = lda; + newarg.ldb = lda; + newarg.ldc = lda; + newarg.alpha = alpha; + newarg.beta = NULL; + newarg.nthreads = args -> nthreads; + + blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; + if (blocking > GEMM_Q) blocking = GEMM_Q; + + for (i = 0; i < n; i += blocking) { + bk = n - i; + if (bk > blocking) bk = blocking; + + newarg.m = bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + + info = CNAME(&newarg, NULL, NULL, sa, sb, 0); + if (info) return info + i; + + if (n - i - bk > 0) { +#ifndef USE_SIMPLE_THREADED_LEVEL3 + newarg.m = n - i - bk; + newarg.k = bk; +#ifndef LOWER + newarg.b = a + ( i + (i + bk) * lda) * COMPSIZE; +#else + newarg.b = a + ((i + bk) + i * lda) * COMPSIZE; +#endif + newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; + + thread_driver(&newarg, sa, sb); +#else + +#ifndef LOWER + newarg.m = bk; + newarg.n = n - i - bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; + + gemm_thread_n(mode | BLAS_TRANSA_T, + &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); + + newarg.n = n - i - bk; + newarg.k = bk; + newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE; + newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; + +#if 0 + HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); +#else + syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, + &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); +#endif +#else + newarg.m = n - i - bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i + bk + i * lda) * COMPSIZE; + + gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, + &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); + + newarg.n = n - i - bk; + newarg.k = bk; + newarg.a = a + (i + bk + i * lda) * COMPSIZE; + newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; + +#if 0 + HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); +#else + syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, + &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); +#endif +#endif + +#endif + } + } + return 0; +} From 752fdb5dd8418c0ae56e308067c043b8fe39e695 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Mar 2018 11:56:56 +0200 Subject: [PATCH 03/61] Add workaround for old gcc and clang versions Old gcc and clang do not handle constructor arguments, finally fix #875 as discussed there, using the fedora patch --- driver/others/memory.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/driver/others/memory.c b/driver/others/memory.c index 474d97c4d9..1cb7519b45 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -148,8 +148,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CONSTRUCTOR __attribute__ ((constructor)) #define DESTRUCTOR __attribute__ ((destructor)) #else +#if __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) #define CONSTRUCTOR __attribute__ ((constructor(101))) #define DESTRUCTOR __attribute__ ((destructor(101))) +#elif __GNUC__ && INIT_PRIORITY +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) +#else +#define CONSTRUCTOR +#define DESTRUCTOR #endif #ifdef DYNAMIC_ARCH From 93db123f7e36fafff65d151cf10f95c54dee3608 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Mar 2018 13:13:49 +0200 Subject: [PATCH 04/61] Update memory.c --- driver/others/memory.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 1cb7519b45..41937ca323 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -147,8 +147,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) #define CONSTRUCTOR __attribute__ ((constructor)) #define DESTRUCTOR __attribute__ ((destructor)) -#else -#if __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) +#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) #define CONSTRUCTOR __attribute__ ((constructor(101))) #define DESTRUCTOR __attribute__ ((destructor(101))) #elif __GNUC__ && INIT_PRIORITY From 01c4b82f045851615074fd1bdc7de06a8b253cf6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 31 Mar 2018 22:32:06 +0200 Subject: [PATCH 05/61] Update memory.c --- driver/others/memory.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 41937ca323..a6d4e636c5 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -150,12 +150,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) #define CONSTRUCTOR __attribute__ ((constructor(101))) #define DESTRUCTOR __attribute__ ((destructor(101))) -#elif __GNUC__ && INIT_PRIORITY +#else #define CONSTRUCTOR __attribute__ ((constructor)) #define DESTRUCTOR __attribute__ ((destructor)) -#else -#define CONSTRUCTOR -#define DESTRUCTOR #endif #ifdef DYNAMIC_ARCH From 8da6b6ae52d0bfa86cf4f3935362039f033b13d9 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Mon, 2 Apr 2018 10:48:22 -0700 Subject: [PATCH 06/61] Allow building on OpenBSD With this change, OpenBLAS builds and all tests pass on OpenBSD 6.2 using Clang. Tested on x86-64 only, with and without DYNAMIC_ARCH=1. --- Makefile | 6 +----- Makefile.install | 9 ++------- Makefile.system | 2 +- c_check | 1 + common.h | 2 +- common_x86.h | 2 +- common_x86_64.h | 2 +- ctest.c | 4 ++++ driver/others/blas_server.c | 2 +- driver/others/memory.c | 10 +++++----- exports/Makefile | 2 +- getarch.c | 6 +++--- 12 files changed, 22 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index 5198f9e2bf..7818b3cd9b 100644 --- a/Makefile +++ b/Makefile @@ -91,11 +91,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) @ln -fs $(LIBSONAME) $(LIBPREFIX).so @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif -ifeq ($(OSNAME), FreeBSD) - @$(MAKE) -C exports so - @ln -fs $(LIBSONAME) $(LIBPREFIX).so -endif -ifeq ($(OSNAME), NetBSD) +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD)) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so endif diff --git a/Makefile.install b/Makefile.install index 81d0972151..e22c61da72 100644 --- a/Makefile.install +++ b/Makefile.install @@ -72,12 +72,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif -ifeq ($(OSNAME), FreeBSD) - @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" - @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBSONAME) $(LIBPREFIX).so -endif -ifeq ($(OSNAME), NetBSD) +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD)) @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so @@ -115,7 +110,7 @@ endif ifndef NO_SHARED #ifeq logical or -ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD)) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) diff --git a/Makefile.system b/Makefile.system index 9720b317ff..062e14b548 100644 --- a/Makefile.system +++ b/Makefile.system @@ -230,7 +230,7 @@ endif MD5SUM = md5 -r endif -ifeq ($(OSNAME), FreeBSD) +ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD)) MD5SUM = md5 -r endif diff --git a/c_check b/c_check index 20da288bec..a48d58d276 100644 --- a/c_check +++ b/c_check @@ -54,6 +54,7 @@ $compiler = GCC if ($compiler eq ""); $os = Linux if ($data =~ /OS_LINUX/); $os = FreeBSD if ($data =~ /OS_FREEBSD/); $os = NetBSD if ($data =~ /OS_NETBSD/); +$os = OpenBSD if ($data =~ /OS_OPENBSD/); $os = Darwin if ($data =~ /OS_DARWIN/); $os = SunOS if ($data =~ /OS_SUNOS/); $os = AIX if ($data =~ /OS_AIX/); diff --git a/common.h b/common.h index ae98279ef3..79f15b89ac 100644 --- a/common.h +++ b/common.h @@ -93,7 +93,7 @@ extern "C" { #include #endif -#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID) +#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_ANDROID) #include #endif diff --git a/common_x86.h b/common_x86.h index 4363fb2f4c..4cf7834733 100644 --- a/common_x86.h +++ b/common_x86.h @@ -327,7 +327,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #endif #endif -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(__ELF__) #define PROLOGUE \ .text; \ .align 16; \ diff --git a/common_x86_64.h b/common_x86_64.h index bee88d3ce0..4ce2ef7bf3 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -403,7 +403,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define EPILOGUE .end #endif -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(__ELF__) || defined(C_PGI) #define PROLOGUE \ .text; \ .align 512; \ diff --git a/ctest.c b/ctest.c index 27d3b473a3..de289ccea8 100644 --- a/ctest.c +++ b/ctest.c @@ -60,6 +60,10 @@ OS_FREEBSD OS_NETBSD #endif +#if defined(__OpenBSD__) +OS_OPENBSD +#endif + #if defined(__sun) OS_SUNOS #endif diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 2e0fe190d7..863c587738 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*********************************************************************/ #include "common.h" -#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) +#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) #include #include #include diff --git a/driver/others/memory.c b/driver/others/memory.c index 474d97c4d9..8efe8f0865 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -108,7 +108,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif -#if defined(OS_FREEBSD) || defined(OS_DARWIN) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DARWIN) #include #include #endif @@ -246,7 +246,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FREEBSD) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) int get_num_procs(void) { @@ -336,7 +336,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -344,7 +344,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif @@ -368,7 +368,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif diff --git a/exports/Makefile b/exports/Makefile index 79c251d62d..e5e203053f 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -156,7 +156,7 @@ endif endif #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or -ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD)) +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD)) so : ../$(LIBSONAME) diff --git a/getarch.c b/getarch.c index 24ea5fe5f3..94c6ae6a4b 100644 --- a/getarch.c +++ b/getarch.c @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef OS_WINDOWS #include #endif -#if defined(__FreeBSD__) || defined(__APPLE__) +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__APPLE__) #include #include #endif @@ -1074,7 +1074,7 @@ static int get_num_cores(void) { #ifdef OS_WINDOWS SYSTEM_INFO sysinfo; -#elif defined(__FreeBSD__) || defined(__APPLE__) +#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__APPLE__) int m[2], count; size_t len; #endif @@ -1088,7 +1088,7 @@ static int get_num_cores(void) { GetSystemInfo(&sysinfo); return sysinfo.dwNumberOfProcessors; -#elif defined(__FreeBSD__) || defined(__APPLE__) +#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__APPLE__) m[0] = CTL_HW; m[1] = HW_NCPU; len = sizeof(int); From a41d241a0e9fe70c95e9ce1e406d5c57fd2d593b Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Tue, 3 Apr 2018 16:39:29 -0700 Subject: [PATCH 07/61] Add support for DragonFly BSD --- Makefile | 2 +- Makefile.install | 4 ++-- Makefile.system | 2 +- c_check | 1 + common.h | 2 +- common_x86_64.h | 2 +- ctest.c | 4 ++++ driver/others/blas_server.c | 2 +- driver/others/memory.c | 10 +++++----- getarch.c | 6 +++--- 10 files changed, 20 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index 7818b3cd9b..c0e5fbcf82 100644 --- a/Makefile +++ b/Makefile @@ -91,7 +91,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) @ln -fs $(LIBSONAME) $(LIBPREFIX).so @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif -ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD)) +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so endif diff --git a/Makefile.install b/Makefile.install index e22c61da72..9ce5ceae6e 100644 --- a/Makefile.install +++ b/Makefile.install @@ -72,7 +72,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif -ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD)) +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so @@ -110,7 +110,7 @@ endif ifndef NO_SHARED #ifeq logical or -ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) diff --git a/Makefile.system b/Makefile.system index 062e14b548..d504a11119 100644 --- a/Makefile.system +++ b/Makefile.system @@ -230,7 +230,7 @@ endif MD5SUM = md5 -r endif -ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD)) +ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) MD5SUM = md5 -r endif diff --git a/c_check b/c_check index a48d58d276..a3b3376024 100644 --- a/c_check +++ b/c_check @@ -55,6 +55,7 @@ $os = Linux if ($data =~ /OS_LINUX/); $os = FreeBSD if ($data =~ /OS_FREEBSD/); $os = NetBSD if ($data =~ /OS_NETBSD/); $os = OpenBSD if ($data =~ /OS_OPENBSD/); +$os = DragonFly if ($data =~ /OS_DRAGONFLY/); $os = Darwin if ($data =~ /OS_DARWIN/); $os = SunOS if ($data =~ /OS_SUNOS/); $os = AIX if ($data =~ /OS_AIX/); diff --git a/common.h b/common.h index 79f15b89ac..5a599a5afa 100644 --- a/common.h +++ b/common.h @@ -93,7 +93,7 @@ extern "C" { #include #endif -#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_ANDROID) +#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_ANDROID) #include #endif diff --git a/common_x86_64.h b/common_x86_64.h index 4ce2ef7bf3..1cc71506a5 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -403,7 +403,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define EPILOGUE .end #endif -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(__ELF__) || defined(C_PGI) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(__ELF__) || defined(C_PGI) #define PROLOGUE \ .text; \ .align 512; \ diff --git a/ctest.c b/ctest.c index de289ccea8..00be423d1c 100644 --- a/ctest.c +++ b/ctest.c @@ -64,6 +64,10 @@ OS_NETBSD OS_OPENBSD #endif +#if defined(__DragonFly__) +OS_DRAGONFLY +#endif + #if defined(__sun) OS_SUNOS #endif diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 863c587738..794dfb20ea 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*********************************************************************/ #include "common.h" -#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) +#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) #include #include #include diff --git a/driver/others/memory.c b/driver/others/memory.c index 8efe8f0865..6920efaaa3 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -108,7 +108,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif -#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DARWIN) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) #include #include #endif @@ -246,7 +246,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FREEBSD) || defined(OS_OPENBSD) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) int get_num_procs(void) { @@ -336,7 +336,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -344,7 +344,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif @@ -368,7 +368,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif diff --git a/getarch.c b/getarch.c index 94c6ae6a4b..992fc2b953 100644 --- a/getarch.c +++ b/getarch.c @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef OS_WINDOWS #include #endif -#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__APPLE__) +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__) #include #include #endif @@ -1074,7 +1074,7 @@ static int get_num_cores(void) { #ifdef OS_WINDOWS SYSTEM_INFO sysinfo; -#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__APPLE__) +#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__) int m[2], count; size_t len; #endif @@ -1088,7 +1088,7 @@ static int get_num_cores(void) { GetSystemInfo(&sysinfo); return sysinfo.dwNumberOfProcessors; -#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__APPLE__) +#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__) m[0] = CTL_HW; m[1] = HW_NCPU; len = sizeof(int); From 33f838393cb3870723774d51afc27405fa2c6429 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Tue, 3 Apr 2018 16:42:01 -0700 Subject: [PATCH 08/61] Add OpenBSD and DragonFly to community supported platforms --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index ec32c1f60e..b5449a45e4 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,8 @@ Please read GotoBLAS_01Readme.txt - **MingWin or Visual Studio(CMake)/Windows**: Please read . - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. - **FreeBSD**: Supported by community. We didn't test the library on this OS. +- **OpenBSD**: Supported by community. We didn't test the library on this OS. +- **DragonFly BSD**: Supported by community. We didn't test the library on this OS. - **Android**: Supported by community. Please read . ## Usages From bb9876db33952cf9e2636edda50b6cb0eb6f5912 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 4 Apr 2018 18:16:52 +0200 Subject: [PATCH 09/61] Fix thread races and infinite looping on systems with many cpus On systems with more than 64 cpus, blas_quickdivide will sometimes return zero which creates bogus workloads when used for the stride calculation. This then leads to threads spinning incessantly waiting for a status change that never happens, as seen in #1497. This patch also fixes several data races that were found by helgrind and/or tsan while debugging the issue. --- lapack/getrf/getrf_parallel.c | 96 ++++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 8 deletions(-) diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index db8c836e0a..91d97a7916 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -67,6 +67,26 @@ double sqrt(double); #undef GETRF_FACTOR #define GETRF_FACTOR 1.00 + +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t getrf_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t getrf_lock = 0; +#else +static BLASULONG getrf_lock = 0UL; +#endif + +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t getrf_flag_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t getrf_flag_lock = 0; +#else +static BLASULONG getrf_flag_lock = 0UL; +#endif + + + + static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) { double m = (double)(M - IS - BK); @@ -217,7 +237,10 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * blasint *ipiv = (blasint *)args -> c; - volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; + //_Atomic + BLASLONG jw; + + _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; if (args -> a == NULL) { TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); @@ -245,8 +268,20 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { for (i = 0; i < args -> nthreads; i++) +#if 1 + { + LOCK_COMMAND(&getrf_lock); + jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; + UNLOCK_COMMAND(&getrf_lock); + do { + LOCK_COMMAND(&getrf_lock); + jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; + UNLOCK_COMMAND(&getrf_lock); + } while (jw); + } +#else while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {}; - +#endif for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -283,18 +318,23 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * b + (is + jjs * lda) * COMPSIZE, lda, is); } } - MB; - for (i = 0; i < args -> nthreads; i++) + for (i = 0; i < args -> nthreads; i++) { +LOCK_COMMAND(&getrf_lock); job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; - +UNLOCK_COMMAND(&getrf_lock); + } } +LOCK_COMMAND(&getrf_flag_lock); flag[mypos * CACHE_LINE_SIZE] = 0; +UNLOCK_COMMAND(&getrf_flag_lock); if (m == 0) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { +LOCK_COMMAND(&getrf_lock); job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0; +UNLOCK_COMMAND(&getrf_lock); } } @@ -318,7 +358,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { if ((current != mypos) && (!is)) { +#if 1 + LOCK_COMMAND(&getrf_lock); + jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; + UNLOCK_COMMAND(&getrf_lock); + do { + LOCK_COMMAND(&getrf_lock); + jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; + UNLOCK_COMMAND(&getrf_lock); + } while (jw == 0); +#else while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {}; +#endif } KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, @@ -327,7 +378,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * MB; if (is + min_i >= m) { +LOCK_COMMAND(&getrf_lock); job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; +UNLOCK_COMMAND(&getrf_lock); } } @@ -339,7 +392,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (i = 0; i < args -> nthreads; i++) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { +#if 1 + LOCK_COMMAND(&getrf_lock); + jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; + UNLOCK_COMMAND(&getrf_lock); + do { + LOCK_COMMAND(&getrf_lock); + jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; + UNLOCK_COMMAND(&getrf_lock); + } while(jw != 0); +#else while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {}; +#endif } } @@ -374,6 +438,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, BLASLONG i, j, k, is, bk; BLASLONG num_cpu; + BLASLONG f; #ifdef _MSC_VER BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE]; @@ -501,11 +566,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (mm >= nn) { width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (width == 0) width = nn; if (nn < width) width = nn; nn -= width; range_N[num_cpu + 1] = range_N[num_cpu] + width; width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (width == 0) width = mm; if (mm < width) width = mm; if (nn <= 0) width = mm; mm -= width; @@ -514,11 +581,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, } else { width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (width == 0) width = mm; if (mm < width) width = mm; mm -= width; range_M[num_cpu + 1] = range_M[num_cpu] + width; width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (width == 0) width = nn; if (nn < width) width = nn; if (mm <= 0) width = nn; nn -= width; @@ -561,7 +630,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, range_n_new[1] = offset + is + bk; if (num_cpu > 0) { - queue[num_cpu - 1].next = NULL; exec_blas_async(0, &queue[0]); @@ -572,8 +640,20 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (iinfo && !info) info = iinfo + is; - for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {}; - + for (i = 0; i < num_cpu; i ++) { +#if 1 + LOCK_COMMAND(&getrf_flag_lock); + f=flag[i*CACHE_LINE_SIZE]; + UNLOCK_COMMAND(&getrf_flag_lock); + while (f!=0) { + LOCK_COMMAND(&getrf_flag_lock); + f=flag[i*CACHE_LINE_SIZE]; + UNLOCK_COMMAND(&getrf_flag_lock); + }; +#else + while (flag[i*CACHE_LINE_SIZE]) {}; +#endif + } TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb); } else { From 36a17536ca739cea2c773a478b7bc0688cd59434 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Tue, 3 Apr 2018 15:09:25 -0700 Subject: [PATCH 10/61] Compile with cc rather than gcc whenever possible --- Makefile.system | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/Makefile.system b/Makefile.system index d504a11119..769628e982 100644 --- a/Makefile.system +++ b/Makefile.system @@ -17,16 +17,20 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib # http://stackoverflow.com/questions/4029274/mingw-and-make-variables # - Default value is 'cc' which is not always a valid command (e.g. MinGW). ifeq ($(origin CC),default) + +# Check if $(CC) refers to a valid command and set the value to gcc if not +ifneq ($(findstring cmd.exe,$(SHELL)),) +ifeq ($(shell where $(CC) 2>NUL),) +CC = gcc +endif +else # POSIX +ifeq ($(shell command -v $(CC) 2>/dev/null),) CC = gcc -# Change the default compile to clang on Mac OSX. -# http://stackoverflow.com/questions/714100/os-detecting-makefile -UNAME_S := $(shell uname -s) -ifeq ($(UNAME_S),Darwin) - CC = clang -# EXTRALIB += -Wl,-no_compact_unwind endif endif +endif # CC is set to default + # Default Fortran compiler (FC) is selected by f_check. ifndef MAKEFILE_RULE From 8f811a9312f6692c084c25fa78c45827accb7103 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Wed, 4 Apr 2018 11:41:45 -0700 Subject: [PATCH 11/61] Reinstate macOS logic --- Makefile.system | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Makefile.system b/Makefile.system index 769628e982..142cb420f8 100644 --- a/Makefile.system +++ b/Makefile.system @@ -23,11 +23,16 @@ ifneq ($(findstring cmd.exe,$(SHELL)),) ifeq ($(shell where $(CC) 2>NUL),) CC = gcc endif -else # POSIX +else # POSIX-ish ifeq ($(shell command -v $(CC) 2>/dev/null),) +ifeq ($(shell uname -s),Darwin) +CC = clang +# EXTRALIB += -Wl,-no_compact_unwind +else CC = gcc -endif -endif +endif # Darwin +endif # CC exists +endif # Shell is sane endif # CC is set to default From ca8ca796d3d6d35b33f879d3af75567fcb7348c5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 4 Apr 2018 22:26:51 +0200 Subject: [PATCH 12/61] Underline importance of NUM_THREADS setting for BUFFER allocation following augray's suggestion from #1451, and incorporating ashwinyes' comments from #1141 on the importance of NUM_THREADS even for single-threaded builds. --- USAGE.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/USAGE.md b/USAGE.md index c76ceb3247..89f3bba676 100644 --- a/USAGE.md +++ b/USAGE.md @@ -14,6 +14,20 @@ Please build OpenBLAS with larger `NUM_THREADS`. For example, `make NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set `MAX_CPU_NUMBER=NUM_THREADS`. +Despite its name, and due to the use of memory buffers in functions like SGEMM, +the setting of NUM_THREADS can be relevant even for a single-threaded build +of OpenBLAS, if such functions get called by multiple threads of a program +that uses OpenBLAS. In some cases, the affected code may simply crash or throw +a segmentation fault without displaying the above warning first. + +Note that the number of threads used at runtime can be altered to differ from the +value NUM_THREADS was set to at build time. At runtime, the actual number of +threads can be set anywhere from 1 to the build's NUM_THREADS (note however, +that this does not change the number of memory buffers that will be allocated, +which is set at build time). The number of threads for a process can be set by +using the mechanisms described below. + + #### How can I use OpenBLAS in multi-threaded applications? If your application is already multi-threaded, it will conflict with OpenBLAS From 8ec28ff4619b8e6fc2c88543f2902e8f95948ae0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 4 Apr 2018 22:40:30 +0200 Subject: [PATCH 13/61] Remove unguarded use of _Atomic and fix tabbing --- lapack/getrf/getrf_parallel.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index 91d97a7916..b48765e55f 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -237,10 +237,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * blasint *ipiv = (blasint *)args -> c; - //_Atomic BLASLONG jw; - _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; + volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; if (args -> a == NULL) { TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); @@ -320,21 +319,21 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * } MB; for (i = 0; i < args -> nthreads; i++) { -LOCK_COMMAND(&getrf_lock); + LOCK_COMMAND(&getrf_lock); job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; -UNLOCK_COMMAND(&getrf_lock); + UNLOCK_COMMAND(&getrf_lock); } } -LOCK_COMMAND(&getrf_flag_lock); + LOCK_COMMAND(&getrf_flag_lock); flag[mypos * CACHE_LINE_SIZE] = 0; -UNLOCK_COMMAND(&getrf_flag_lock); + UNLOCK_COMMAND(&getrf_flag_lock); if (m == 0) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { -LOCK_COMMAND(&getrf_lock); + LOCK_COMMAND(&getrf_lock); job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0; -UNLOCK_COMMAND(&getrf_lock); + UNLOCK_COMMAND(&getrf_lock); } } @@ -378,9 +377,9 @@ UNLOCK_COMMAND(&getrf_lock); MB; if (is + min_i >= m) { -LOCK_COMMAND(&getrf_lock); + LOCK_COMMAND(&getrf_lock); job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; -UNLOCK_COMMAND(&getrf_lock); + UNLOCK_COMMAND(&getrf_lock); } } From 137ccd9dd96468ce26cea78ba75a70a7f3c73079 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Wed, 4 Apr 2018 14:30:32 -0700 Subject: [PATCH 14/61] Minor changes to wording and formatting in the README The wording in some places is not grammatically correct. This change also provides minor adjustments to the Markdown formatting which provide modest improvements to readability. --- README.md | 228 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 135 insertions(+), 93 deletions(-) diff --git a/README.md b/README.md index b5449a45e4..02d0873344 100644 --- a/README.md +++ b/README.md @@ -5,177 +5,219 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) + ## Introduction + OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. -Please read the documents on OpenBLAS wiki pages . +Please read the documentation on the OpenBLAS wiki pages: . ## Binary Packages -We provide binary packages for the following platform. + +We provide official binary packages for the following platform: * Windows x86/x86_64 You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/). ## Installation from Source -Download from project homepage. http://xianyi.github.com/OpenBLAS/ -Or, check out codes from git://github.com/xianyi/OpenBLAS.git -### Normal compile - * type "make" to detect the CPU automatically. - or - * type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. +Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code +using Git from https://github.com/xianyi/OpenBLAS.git. -### Cross compile -Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly. +### Dependencies -Examples: +Building OpenBLAS requires the following to be installed: -On X86 box, compile this library for loongson3a CPU. +* GNU Make +* A C compiler, e.g. GCC or Clang +* A Fortran compiler (optional, for LAPACK) +* IBM MASS (optional, see below) - make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A +### Normal compile -On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler. +Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically. +To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`. +The full target list is in the file `TargetList.txt`. - make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 +### Cross compile -### Debug version +Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler. +The target must be specified explicitly when cross compiling. + +Examples: - make DEBUG=1 +* On an x86 box, compile this library for a loongson3a CPU: + ```sh + make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A + ``` -### Compile with MASS Support on Power CPU (Optional dependency) +* On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler: + ```sh + make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 + ``` -[IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and -Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires 64-bit, little-endian OS on POWER. -The library can be installed as below - +### Debug version - * On Ubuntu: +A debug version can be built using `make DEBUG=1`. - wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
- echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
- sudo apt-get update
- sudo apt-get install libxlmass-devel.8.1.5
+### Compile with MASS support on Power CPU (optional) - * On RHEL/CentOS: +The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library +consists of a set of mathematical functions for C, C++, and Fortran applications that are +are tuned for optimum performance on POWER architectures. +OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. +The library can be installed as shown: - wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
- sudo rpm --import repomd.xml.key
- wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
- sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
- sudo yum install libxlmass-devel.8.1.5
+* On Ubuntu: + ```sh + wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add - + echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list + sudo apt-get update + sudo apt-get install libxlmass-devel.8.1.5 + ``` -After installing MASS library, compile openblas with USE_MASS=1. +* On RHEL/CentOS: + ```sh + wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key + sudo rpm --import repomd.xml.key + wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo + sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/ + sudo yum install libxlmass-devel.8.1.5 + ``` -Example: +After installing the MASS library, compile OpenBLAS with `USE_MASS=1`. +For example, to compile on Power8 with MASS support: `make USE_MASS=1 TARGET=POWER8`. -Compiling on Power8 with MASS support - +### Install to a specific directory (optional) - make USE_MASS=1 TARGET=POWER8 +Use `PREFIX=` when invoking `make`, for example -### Install to the directory (optional) +```sh +make install PREFIX=your_installation_directory +``` -Example: +The default installation directory is `/opt/OpenBLAS`. - make install PREFIX=your_installation_directory +## Supported CPUs and Operating Systems -The default directory is /opt/OpenBLAS +Please read `GotoBLAS_01Readme.txt`. -## Support CPU & OS -Please read GotoBLAS_01Readme.txt +### Additional supported CPUs -### Additional support CPU: +#### x86/x86-64 -#### x86/x86-64: - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. -- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar) +- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. -#### MIPS64: +#### MIPS64 + - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. - **ICT Loongson 3B**: Experimental -#### ARM: -- **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 ( e.g. BCM2835, Cortex M0+ ) -- **ARMV7**: Optimized BLAS for vfpv3-d32 ( e.g. Cortex A8, A9 and A15 ) +#### ARM -#### ARM64: -- **ARMV8**: Experimental +- **ARMv6**: Optimized BLAS for vfpv2 and vfpv3-d16 (e.g. BCM2835, Cortex M0+) +- **ARMv7**: Optimized BLAS for vfpv3-d32 (e.g. Cortex A8, A9 and A15) + +#### ARM64 + +- **ARMv8**: Experimental - **ARM Cortex-A57**: Experimental #### PPC/PPC64 -- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with USE_OPENMP=1 -#### IBM zEnterprise System: -- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) - +- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` -### Support OS: -- **GNU/Linux** -- **MingWin or Visual Studio(CMake)/Windows**: Please read . -- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. -- **FreeBSD**: Supported by community. We didn't test the library on this OS. -- **OpenBSD**: Supported by community. We didn't test the library on this OS. -- **DragonFly BSD**: Supported by community. We didn't test the library on this OS. -- **Android**: Supported by community. Please read . +#### IBM zEnterprise System -## Usages -Link with libopenblas.a or -lopenblas for shared library. +- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) -### Set the number of threads with environment variables. +### Supported OS -Examples: +- **GNU/Linux** +- **MinGW or Visual Studio (CMake)/Windows**: Please read . +- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts. +- **FreeBSD**: Supported by the community. We don't actively test the library on this OS. +- **OpenBSD**: Supported by the community. We don't actively test the library on this OS. +- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS. +- **Android**: Supported by the community. Please read . - export OPENBLAS_NUM_THREADS=4 +## Usage - or +Statically link with `libopenblas.a` or dynamically link with `-lopenblas` if OpenBLAS was +compiled as a shared library. - export GOTO_NUM_THREADS=4 +### Setting the number of threads using environment variables - or +Environment variables are used to specify a maximum number of threads. +For example, - export OMP_NUM_THREADS=4 +```sh +export OPENBLAS_NUM_THREADS=4 +export GOTO_NUM_THREADS=4 +export OMP_NUM_THREADS=4 +``` -The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. +The priorities are `OPENBLAS_NUM_THREADS` > `GOTO_NUM_THREADS` > `OMP_NUM_THREADS`. -If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. +If you compile this library with `USE_OPENMP=1`, you should set the `OMP_NUM_THREADS` +environment variable; OpenBLAS ignores `OPENBLAS_NUM_THREADS` and `GOTO_NUM_THREADS` when +compiled with `USE_OPENMP=1`. -### Set the number of threads on runtime. +### Setting the number of threads at runtime -We provided the below functions to control the number of threads on runtime. +We provide the following functions to control the number of threads at runtime: - void goto_set_num_threads(int num_threads); +```c +void goto_set_num_threads(int num_threads); +void openblas_set_num_threads(int num_threads); +``` - void openblas_set_num_threads(int num_threads); +If you compile this library with `USE_OPENMP=1`, you should use the above functions too. -If you compile this lib with USE_OPENMP=1, you should use the above functions, too. +## Reporting bugs -## Report Bugs -Please add a issue in https://github.com/xianyi/OpenBLAS/issues +Please submit an issue in https://github.com/xianyi/OpenBLAS/issues. ## Contact + * OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users * OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev -## ChangeLog -Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. +## Change log + +Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 1.13 BSD version. ## Troubleshooting -* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first. -* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. -* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. -* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1. -* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). -* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. + +* Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first. +* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. +* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. + Clang 3.0 will generate the wrong AVX binary code. +* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`), + there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build + the library with `BIGNUMA=1`. +* OpenBLAS does not set processor affinity by default. + On Linux, you can enable processor affinity by commenting out the line `NO_AFFINITY=1` in + Makefile.rule. However, note that this may cause + [a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). +* On Loongson 3A, `make test` may fail with a `pthread_create` error (`EAGAIN`). + However, it will be okay when you run the same test case on the shell. ## Contributing -1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug. -1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes. -1. Write a test which shows that the bug was fixed or that the feature works as expected. -1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`. + +1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue + to start a discussion around a feature idea or a bug. +2. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes. +3. Write a test which shows that the bug was fixed or that the feature works as expected. +4. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`. ## Donation + Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation). From 24f8d5b62413543148ea6f9a44cac875f95a6387 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Fri, 6 Apr 2018 17:30:10 -0700 Subject: [PATCH 15/61] Add DragonFly to exports/Makefile Its exclusion was an oversight on my part. --- exports/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exports/Makefile b/exports/Makefile index e5e203053f..53d4f75bbc 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -156,7 +156,7 @@ endif endif #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or -ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD)) +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) so : ../$(LIBSONAME) From 6a0930560e6fc5dc4ce204cf11ca8f9818c7fddc Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Fri, 6 Apr 2018 17:53:58 -0700 Subject: [PATCH 16/61] Add macOS to the Travis testing matrix --- .travis.yml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 0b280c2fce..e599c75e72 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,7 @@ language: c jobs: include: - &test-ubuntu + os: linux stage: test compiler: gcc addons: @@ -57,7 +58,8 @@ jobs: - TARGET_BOX=LINUX32 - BTYPE="BINARY=32" - - stage: test + - os: linux + stage: test compiler: gcc addons: apt: @@ -77,6 +79,7 @@ jobs: # which is slower than container-based infrastructure used for jobs # that don't require sudo. - &test-alpine + os: linux stage: test dist: trusty sudo: true @@ -120,6 +123,7 @@ jobs: - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" - &test-cmake + os: linux stage: test compiler: clang addons: @@ -147,6 +151,17 @@ jobs: env: - CMAKE=1 + - os: osx + stage: test + osx_image: xcode8 + before_script: *common-before + - brew update + - brew install gcc # for gfortran + script: + - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + env: + - BTYPE="BINARY=64 INTERFACE64=1" + # whitelist branches: only: From daae8fd197fd71691bf5432445d78ce17e83a039 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Apr 2018 13:27:24 +0200 Subject: [PATCH 17/61] Revert "Add macOS to the Travis testing matrix" --- .travis.yml | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/.travis.yml b/.travis.yml index e599c75e72..0b280c2fce 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,6 @@ language: c jobs: include: - &test-ubuntu - os: linux stage: test compiler: gcc addons: @@ -58,8 +57,7 @@ jobs: - TARGET_BOX=LINUX32 - BTYPE="BINARY=32" - - os: linux - stage: test + - stage: test compiler: gcc addons: apt: @@ -79,7 +77,6 @@ jobs: # which is slower than container-based infrastructure used for jobs # that don't require sudo. - &test-alpine - os: linux stage: test dist: trusty sudo: true @@ -123,7 +120,6 @@ jobs: - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" - &test-cmake - os: linux stage: test compiler: clang addons: @@ -151,17 +147,6 @@ jobs: env: - CMAKE=1 - - os: osx - stage: test - osx_image: xcode8 - before_script: *common-before - - brew update - - brew install gcc # for gfortran - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - BTYPE="BINARY=64 INTERFACE64=1" - # whitelist branches: only: From 2e988dbf35211e2346d694ad534166c49b297703 Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Sat, 7 Apr 2018 10:56:34 -0700 Subject: [PATCH 18/61] Add macOS to the Travis testing matrix --- .travis.yml | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 0b280c2fce..3460db719b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,7 @@ language: c jobs: include: - &test-ubuntu + os: linux stage: test compiler: gcc addons: @@ -57,7 +58,8 @@ jobs: - TARGET_BOX=LINUX32 - BTYPE="BINARY=32" - - stage: test + - os: linux + stage: test compiler: gcc addons: apt: @@ -77,6 +79,7 @@ jobs: # which is slower than container-based infrastructure used for jobs # that don't require sudo. - &test-alpine + os: linux stage: test dist: trusty sudo: true @@ -120,6 +123,7 @@ jobs: - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" - &test-cmake + os: linux stage: test compiler: clang addons: @@ -147,6 +151,18 @@ jobs: env: - CMAKE=1 + - os: osx + stage: test + osx_image: xcode8 + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" + - brew update + - brew install gcc # for gfortran + script: + - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + env: + - BTYPE="BINARY=64 INTERFACE64=1" + # whitelist branches: only: From b966bd79d5499d37e15b72bda0aad2ef4167b45f Mon Sep 17 00:00:00 2001 From: Alex Arslan Date: Sat, 7 Apr 2018 12:29:57 -0700 Subject: [PATCH 19/61] Add a BINARY=32 build to macOS --- .travis.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 3460db719b..4a25e7121a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -151,7 +151,8 @@ jobs: env: - CMAKE=1 - - os: osx + - &test-macos + os: osx stage: test osx_image: xcode8 before_script: @@ -163,6 +164,10 @@ jobs: env: - BTYPE="BINARY=64 INTERFACE64=1" + - <<: *test-macos + env: + - BTYPE="BINARY=32" + # whitelist branches: only: From 0fe434598b177fd2d4563b873d5cdd8eb05731a6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 10 Apr 2018 23:30:59 +0200 Subject: [PATCH 20/61] Fix precision of mips dsdot --- kernel/mips/dot.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/mips/dot.c b/kernel/mips/dot.c index de7f7167f8..cbd3efc642 100644 --- a/kernel/mips/dot.c +++ b/kernel/mips/dot.c @@ -41,8 +41,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) while(i < n) { - - dot += y[iy] * x[ix] ; +#if defined(DSDOT) + dot += (double)(y[iy] * (double)x[ix] ; +#else + dot += y[iy] * x[ix]; +#endif ix += inc_x ; iy += inc_y ; i++ ; From 7c861605b22d1d88d26f54bea44319edce2b1b25 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 14 Apr 2018 18:29:10 +0200 Subject: [PATCH 21/61] Catch invalid cpu count returned by CPU_COUNT_S mips32 was seen to return zero here, driving nthreads to zero with subsequent fpe in blas_quickdivide --- driver/others/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 474d97c4d9..93f185e2f5 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -209,7 +209,8 @@ int ret; size = CPU_ALLOC_SIZE(nums); ret = sched_getaffinity(0,size,cpusetp); if (ret!=0) return nums; - nums = CPU_COUNT_S(size,cpusetp); + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; CPU_FREE(cpusetp); return nums; #endif From 734d7c6a93d3350e23f897123fb2a0ea14c3ade1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 14 Apr 2018 18:59:46 +0200 Subject: [PATCH 22/61] Include sys/types.h for proper typedefs related to wait() Should fix #1519 --- utest/test_fork.c | 1 + 1 file changed, 1 insertion(+) diff --git a/utest/test_fork.c b/utest/test_fork.c index e7a8dbcee3..9e0244305b 100644 --- a/utest/test_fork.c +++ b/utest/test_fork.c @@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "openblas_utest.h" +#include #include #include From 68a3c4fca60461f69fbec2da80454fde022b1adc Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 19 Apr 2018 09:05:25 +0000 Subject: [PATCH 23/61] ARM64: Enable Auto Detection of ThunderX2T99 --- cpuid_arm64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index bd7fb7f2da..a42346c888 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -121,7 +121,7 @@ int detect(void) return CPU_VULCAN; else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) return CPU_THUNDERX; - else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */ + else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43")) return CPU_THUNDERX2T99; } From 5fcaca6438855fa295e7fa012ffa38f12599ede7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 20 Apr 2018 15:42:13 +0200 Subject: [PATCH 24/61] fork utest depends on CBLAS --- utest/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utest/Makefile b/utest/Makefile index e40b3c6db5..e071540dc1 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -17,11 +17,13 @@ endif #this does not work with OpenMP nor with native Windows or Android threads # FIXME TBD if this works on OSX, SunOS, POWER and zarch +ifneq ($(NO_CBLAS), 1) ifndef USE_OPENMP ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT)) OBJS += test_fork.o endif endif +endif all : run_test From 625c74a38f481e8ed818334abffd493448f77ebd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 20 Apr 2018 15:43:59 +0200 Subject: [PATCH 25/61] fork utest depends on CBLAS --- utest/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 1b426afe7e..77a42d84f3 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -25,6 +25,7 @@ endif () # known to hang with the native Windows and Android threads # FIXME needs checking if this works on any of the other platforms +if (NOT NO_CBLAS) if (NOT USE_OPENMP) if (OS_CYGWIN_NT OR OS_LINUX) set(OpenBLAS_utest_src @@ -33,6 +34,7 @@ set(OpenBLAS_utest_src ) endif() endif() +endif() if (NOT NO_LAPACK) set(OpenBLAS_utest_src From 9c5518319a1370984abe9a2a55a5ebeb1deeccf5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Apr 2018 20:20:04 +0200 Subject: [PATCH 26/61] Revert "Fix 32bit HASWELL builds" --- kernel/Makefile.L3 | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 4284fbfa09..0664263967 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -29,10 +29,8 @@ USE_TRMM = 1 endif ifeq ($(CORE), HASWELL) -ifeq ($(ARCH), x86_64) USE_TRMM = 1 endif -endif ifeq ($(CORE), ZEN) USE_TRMM = 1 From 8a3b6fa108b15331c2af8777d1ea0206f85673b8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Apr 2018 19:05:49 +0200 Subject: [PATCH 27/61] =?UTF-8?q?Use=20generic=20zrot.c=20on=20ppc64/POWER?= =?UTF-8?q?6=20to=20work=20around=20utest=20failure=20from=20=E2=80=A6=20(?= =?UTF-8?q?#1535)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Use generic C implementation of zrot on ppc64/POWER6 to work around utest failure from #1469 --- kernel/power/KERNEL.POWER6 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/power/KERNEL.POWER6 b/kernel/power/KERNEL.POWER6 index 344b205fee..e6d2c9a513 100644 --- a/kernel/power/KERNEL.POWER6 +++ b/kernel/power/KERNEL.POWER6 @@ -54,3 +54,6 @@ ZTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S + +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c From 125343cc886accee268b03f020c09658cff37509 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 24 Apr 2018 22:39:50 +0200 Subject: [PATCH 28/61] Drop test for zero incx,incy in armv7 AXPY ...to pass the related utest (see #1469) --- kernel/arm/axpy_vfp.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S index 37515f3996..c35b8aece9 100644 --- a/kernel/arm/axpy_vfp.S +++ b/kernel/arm/axpy_vfp.S @@ -440,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble axpy_kernel_L999 - +/* cmp INC_X, #0 beq axpy_kernel_L999 cmp INC_Y, #0 beq axpy_kernel_L999 - +*/ cmp INC_X, #1 bne axpy_kernel_S_BEGIN From 2d0929fa7c969cbe8f7dcbf3e5b16ef1301dc6a1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 24 Apr 2018 22:43:00 +0200 Subject: [PATCH 29/61] Move the test for zero incx,incy in ARMV7 ROT to pass the related utest (see #1469) --- kernel/arm/rot_vfp.S | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S index 25f5636906..ea296dbc5c 100644 --- a/kernel/arm/rot_vfp.S +++ b/kernel/arm/rot_vfp.S @@ -483,13 +483,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble rot_kernel_L999 - +/* cmp INC_X, #0 beq rot_kernel_L999 cmp INC_Y, #0 beq rot_kernel_L999 - +*/ cmp INC_X, #1 bne rot_kernel_S_BEGIN @@ -584,6 +584,12 @@ rot_kernel_S1: rot_kernel_S10: KERNEL_S1 + + cmp INC_X, #0 + beq rot_kernel_L999 + + cmp INC_Y, #0 + beq rot_kernel_L999 subs I, I, #1 bne rot_kernel_S10 From a8ed428bab10bc595493c1c3c029a5e8d6f25637 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 25 Apr 2018 22:35:46 +0200 Subject: [PATCH 30/61] Disable multithreading in ztrmv BLAS-Tester shows that the same problem exists as with DTRMV (issue #1332) --- interface/ztrmv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/interface/ztrmv.c b/interface/ztrmv.c index 4c47e9e913..0e16632e06 100644 --- a/interface/ztrmv.c +++ b/interface/ztrmv.c @@ -239,6 +239,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } else nthreads = 1; +/* FIXME TRMV multithreading appears to be broken, see issue 1332*/ + nthreads = 1; + if(nthreads > 1) { buffer_size = n > 16 ? 0 : n * 4 + 40; } From 941ad280a8626adc621e41188f513e3fc8ab1e10 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 25 Apr 2018 22:50:10 +0200 Subject: [PATCH 31/61] Fix typo in MIPS P5600 complex ASUM code selection --- kernel/mips/KERNEL.P5600 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 9a16704d5e..1ab1930698 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -38,8 +38,8 @@ ZASUMKERNEL = ../mips/zasum_msa.c else SASUMKERNEL = ../mips/asum.c DASUMKERNEL = ../mips/asum.c -CASUMKERNEL = ../mips/asum.c -ZASUMKERNEL = ../mips/asum.c +CASUMKERNEL = ../mips/zasum.c +ZASUMKERNEL = ../mips/zasum.c endif ifdef HAVE_MSA @@ -253,4 +253,4 @@ ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -endif \ No newline at end of file +endif From 1b83341d194b9d8f75ec724b0c5ae64144ca3108 Mon Sep 17 00:00:00 2001 From: Zhiyong Dang Date: Tue, 24 Apr 2018 10:34:53 +0800 Subject: [PATCH 32/61] Fix race condition in blas_server_omp.c Change-Id: Ic896276cd073d6b41930c7c5a29d66348cd1725d --- Makefile.rule | 7 +++ Makefile.system | 6 +++ cmake/system.cmake | 6 +++ common.h | 2 +- driver/others/blas_server_omp.c | 91 ++++++++++++++++++++++++--------- 5 files changed, 86 insertions(+), 26 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 62bf63df47..0ce4c40a81 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -60,6 +60,13 @@ VERSION = 0.3.0.dev # automatically detected by the the script. # NUM_THREADS = 24 +# If you have enabled USE_OPENMP and your application would call +# OpenBLAS's caculation API in multi threads, please comment it in. +# This flag define how many OpenBLAS's caculation API can actually +# run in parallel. If more number threads call OpenBLAS's caculation API, +# it would wait former API finish. +# NUM_PARALLEL = 2 + # if you don't need to install the static library, please comment it in. # NO_STATIC = 1 diff --git a/Makefile.system b/Makefile.system index 142cb420f8..463b857b84 100644 --- a/Makefile.system +++ b/Makefile.system @@ -184,6 +184,10 @@ endif endif +ifndef NUM_PARALLEL +NUM_PARALLEL = 1 +endif + ifndef NUM_THREADS NUM_THREADS = $(NUM_CORES) endif @@ -961,6 +965,8 @@ endif CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS) +CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL) + ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif diff --git a/cmake/system.cmake b/cmake/system.cmake index 3fdd9390cd..6458956710 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -96,6 +96,10 @@ if (NOT CMAKE_CROSSCOMPILING) endif() +if (NOT DEFINED NUM_PARALLEL) + set(NUM_PARALLEL 1) +endif() + if (NOT DEFINED NUM_THREADS) if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0) # HT? @@ -224,6 +228,8 @@ endif () set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}") +set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}") + if (USE_SIMPLE_THREADED_LEVEL3) set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") endif () diff --git a/common.h b/common.h index 5a599a5afa..86c33b2fd2 100644 --- a/common.h +++ b/common.h @@ -179,7 +179,7 @@ extern "C" { #define ALLOCA_ALIGN 63UL -#define NUM_BUFFERS (MAX_CPU_NUMBER * 2) +#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) #ifdef NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_ diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 8d62a81256..868db3b1d3 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -36,6 +36,13 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ +#if _STDC_VERSION__ >= 201112L +#ifndef _Atomic +#define _Atomic volatile +#endif +#include +#endif +#include #include #include //#include @@ -49,11 +56,16 @@ int blas_server_avail = 0; -static void * blas_thread_buffer[MAX_CPU_NUMBER]; +static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; +#if _STDC_VERSION__ >= 201112L +static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; +#else +static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; +#endif void goto_set_num_threads(int num_threads) { - int i=0; + int i=0, j=0; if (num_threads < 1) num_threads = blas_num_threads; @@ -68,15 +80,17 @@ void goto_set_num_threads(int num_threads) { omp_set_num_threads(blas_cpu_number); //adjust buffer for each thread - for(i=0; i mode & BLAS_PTHREAD) == 0)) { pos = omp_get_thread_num(); - buffer = blas_thread_buffer[pos]; + buffer = blas_thread_buffer[buf_index][pos]; //fallback if(buffer==NULL) { @@ -291,7 +309,7 @@ static void exec_threads(blas_queue_t *queue){ int exec_blas(BLASLONG num, blas_queue_t *queue){ - BLASLONG i; + BLASLONG i, buf_index; if ((num <= 0) || (queue == NULL)) return 0; @@ -302,6 +320,23 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ } #endif + while(true) { + for(i=0; i < MAX_PARALLEL_NUMBER; i++) { +#if _STDC_VERSION__ >= 201112L + _Bool inuse = false; + if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) { +#else + if(blas_buffer_inuse[i] == false) { + blas_buffer_inuse[i] = true; +#endif + buf_index = i; + break; + } + } + if(i != MAX_PARALLEL_NUMBER) + break; + } + #pragma omp parallel for schedule(static) for (i = 0; i < num; i ++) { @@ -309,9 +344,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ queue[i].position = i; #endif - exec_threads(&queue[i]); + exec_threads(&queue[i], buf_index); } +#if _STDC_VERSION__ >= 201112L + atomic_store(&blas_buffer_inuse[buf_index], false); +#else + blas_buffer_inuse[buf_index] = false; +#endif + return 0; } From 894433a7c71fba89b41af08acdd8fea7b48cc666 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 27 Apr 2018 12:08:06 +0200 Subject: [PATCH 33/61] Update Makefile.rule --- Makefile.rule | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 0ce4c40a81..12734464bb 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -61,10 +61,10 @@ VERSION = 0.3.0.dev # NUM_THREADS = 24 # If you have enabled USE_OPENMP and your application would call -# OpenBLAS's caculation API in multi threads, please comment it in. -# This flag define how many OpenBLAS's caculation API can actually -# run in parallel. If more number threads call OpenBLAS's caculation API, -# it would wait former API finish. +# OpenBLAS's calculation API from multi threads, please comment it in. +# This flag defines how many instances of OpenBLAS's calculation API can +# actually run in parallel. If more threads call OpenBLAS's calculation API, +# they need to wait for the preceding API calls to finish or risk data corruption. # NUM_PARALLEL = 2 # if you don't need to install the static library, please comment it in. From 26ce518d4605db37083404615268b2341340ecb4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Apr 2018 14:34:33 +0200 Subject: [PATCH 34/61] Avoid out of bounds reads from blas_quick_divide_table on big systems Should fix #1541 --- common_x86_64.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/common_x86_64.h b/common_x86_64.h index bee88d3ce0..0542653a10 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -195,7 +195,9 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ unsigned int result; if (y <= 1) return x; - + + if (y > 64) return x/y; + y = blas_quick_divide_table[y]; __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); From 8145ecd70bdfae44f62b5ff9a9e0ee427a2db3db Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Apr 2018 14:38:55 +0200 Subject: [PATCH 35/61] Avoid out-of-bounds reads from blas_quick_divide_table on big systems --- common_x86.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common_x86.h b/common_x86.h index 4363fb2f4c..de014064ee 100644 --- a/common_x86.h +++ b/common_x86.h @@ -179,6 +179,10 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ return result; #else + if ( y > 64) { + result = x/y; + return result; + } y = blas_quick_divide_table[y]; __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); From c1eb06e102f4598efee7f766bf0142653f8c8f73 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Apr 2018 14:40:12 +0200 Subject: [PATCH 36/61] Update common_x86_64.h --- common_x86_64.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/common_x86_64.h b/common_x86_64.h index 0542653a10..a145abc14c 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -196,7 +196,10 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ if (y <= 1) return x; - if (y > 64) return x/y; + if (y > 64) { + result = x / y; + return result; + } y = blas_quick_divide_table[y]; From e93355e5e1fe1b00a7a9587118c5d3b58ce94922 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 May 2018 14:43:08 +0200 Subject: [PATCH 37/61] Omit the table overflow check when building for small systems --- common_x86.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/common_x86.h b/common_x86.h index de014064ee..75b1e1247f 100644 --- a/common_x86.h +++ b/common_x86.h @@ -178,11 +178,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ result = x/y; return result; #else - +#if (MAX_CPU_NUMBER > 64) if ( y > 64) { result = x/y; return result; - } + } +#endif + y = blas_quick_divide_table[y]; __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); From d0c0506588281b34717a3e7b17e9cc2c4a5cef8d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 May 2018 14:44:50 +0200 Subject: [PATCH 38/61] Omit the divide table overflow check on small systems --- common_x86_64.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/common_x86_64.h b/common_x86_64.h index a145abc14c..9d0ef4e752 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -195,11 +195,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ unsigned int result; if (y <= 1) return x; - + +#if (MAX_CPU_NUMBER > 64) if (y > 64) { result = x / y; return result; } +#endif y = blas_quick_divide_table[y]; From 3af1b5c805a5831d20dcae416d362e7e87515e53 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 May 2018 20:12:25 +0200 Subject: [PATCH 39/61] Make cpuid_mips compile again and add 1004K cpu --- cpuid_mips.c | 58 ++++++++++++++++++---------------------------------- 1 file changed, 20 insertions(+), 38 deletions(-) diff --git a/cpuid_mips.c b/cpuid_mips.c index 15c58959e0..c09902936a 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_UNKNOWN 0 #define CPU_P5600 1 +#define CPU_1004K 2 static char *cpuname[] = { "UNKOWN", - "P5600" + "P5600", + "1004K" }; int detect(void){ @@ -90,7 +92,7 @@ int detect(void){ if (!strncmp("cpu", buffer, 3)){ p = strchr(buffer, ':') + 2; #if 0 - fprintf(stderr, "%s\n", p); + fprintf(stderr, "%s \n", p); #endif break; } @@ -99,43 +101,13 @@ int detect(void){ fclose(infile); if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - }else if (strstr(p, "Loongson-3")){ - infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("system type", buffer, 11)){ - p = strchr(buffer, ':') + 2; - break; - } - } - fclose(infile); - if (strstr(p, "loongson3a")) - return CPU_LOONGSON3A; - }else{ + if (strstr(p, "5600")) { + return CPU_P5600; + } else if (strstr(p, "1004K")) { + return CPU_1004K; + } else return CPU_UNKNOWN; } - } - //Check model name for Loongson3 - infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("model name", buffer, 10)){ - p = strchr(buffer, ':') + 2; - break; - } - } - fclose(infile); - if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - } - } #endif return CPU_UNKNOWN; } @@ -149,7 +121,7 @@ void get_architecture(void){ } void get_subarchitecture(void){ - if(detect()==CPU_P5600){ + if(detect()==CPU_P5600|| detect()==CPU_1004K){ printf("P5600"); }else{ printf("UNKNOWN"); @@ -170,6 +142,14 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); + } else if (detect()==CPU_1004K) { + printf("#define MIPS1004K\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 26144\n"); + printf("#define DTB_DEFAULT_ENTRIES 8\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); }else{ printf("#define UNKNOWN\n"); } @@ -178,6 +158,8 @@ void get_cpuconfig(void){ void get_libname(void){ if(detect()==CPU_P5600) { printf("p5600\n"); + } else if (detect()==CPU_1004K) { + printf("1004K\n"); }else{ printf("mips\n"); } From d94d7baf7ea5010af47a71a4e01febb08c0d535c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 May 2018 20:17:26 +0200 Subject: [PATCH 40/61] Add mips32r2 api target --- Makefile.prebuild | 4 ++++ Makefile.system | 9 +++++++-- param.h | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index daa556f65b..a366004a12 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -17,6 +17,10 @@ ifdef CPUIDEMU EXFLAGS = -DCPUIDEMU -DVENDOR=99 endif +ifeq ($(TARGET), 1004K) +TARGET_FLAGS = -mips32r2 +endif + ifeq ($(TARGET), P5600) TARGET_FLAGS = -mips32r5 endif diff --git a/Makefile.system b/Makefile.system index 142cb420f8..fdc4087814 100644 --- a/Makefile.system +++ b/Makefile.system @@ -564,9 +564,14 @@ CCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64 endif +ifeq ($(CORE), 1004K) +CCOMMON_OPT += -mips32r2 $(MSA_FLAGS) +FCOMMON_OPT += -mips32r2 $(MSA_FLAGS) +endif + ifeq ($(CORE), P5600) -CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) -FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) +CCOMMON_OPT += -mips32r5 $(MSA_FLAGS) +FCOMMON_OPT += -mips32r5 $(MSA_FLAGS) endif ifeq ($(CORE), I6400) diff --git a/param.h b/param.h index 189cdc4a02..4227d548e8 100644 --- a/param.h +++ b/param.h @@ -2291,7 +2291,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#if defined(P5600) || defined(I6400) || defined(P6600) || defined(I6500) +#if defined(P5600) || defined(MIPS1004K) || defined(I6400) || defined(P6600) || defined(I6500) #define SNUMOPT 2 #define DNUMOPT 2 From 9d5098dbc94cf3bfdc8e9e85043cf285d27cf0da Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 May 2018 20:20:44 +0200 Subject: [PATCH 41/61] Add MIPS 1004K target (Mediatek MT7621 SOC) --- kernel/mips/KERNEL.1004K | 1 + 1 file changed, 1 insertion(+) create mode 100644 kernel/mips/KERNEL.1004K diff --git a/kernel/mips/KERNEL.1004K b/kernel/mips/KERNEL.1004K new file mode 100644 index 0000000000..67135356e2 --- /dev/null +++ b/kernel/mips/KERNEL.1004K @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.P5600 From 018f2dad27c764d912fb5ad6cf8bf560f05f2d63 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 May 2018 20:25:32 +0200 Subject: [PATCH 42/61] Switch mips32 target to USE_TRMM to fix complex TRMM --- kernel/Makefile.L3 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 0664263967..4d2999b678 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -20,6 +20,10 @@ ifeq ($(ARCH), arm64) USE_TRMM = 1 endif +ifeq ($(ARCH), mips) +USE_TRMM = 1 +endif + ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif From 73cc321190a5c1ba6004ecfa6df8b19321b3ed49 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 May 2018 20:27:56 +0200 Subject: [PATCH 43/61] Add MIPS 1004K target --- TargetList.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/TargetList.txt b/TargetList.txt index d40545cf80..aeeaa9ede3 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -56,6 +56,7 @@ CELL 3.MIPS CPU: P5600 +1004K 4.MIPS64 CPU: SICORTEX From 71051259e060abc797eb59a6cc718c2f2dd2f1d6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 May 2018 20:37:06 +0200 Subject: [PATCH 44/61] Restore compiler options for mips P5600 target --- Makefile.system | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index fdc4087814..f2fdc5c4bf 100644 --- a/Makefile.system +++ b/Makefile.system @@ -570,8 +570,8 @@ FCOMMON_OPT += -mips32r2 $(MSA_FLAGS) endif ifeq ($(CORE), P5600) -CCOMMON_OPT += -mips32r5 $(MSA_FLAGS) -FCOMMON_OPT += -mips32r5 $(MSA_FLAGS) +CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) +FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) endif ifeq ($(CORE), I6400) From 5966fd52a23683a10995202dba3e781e9dfcbf9f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 May 2018 21:36:56 +0200 Subject: [PATCH 45/61] Drop C-style "L" suffix from OPENMP version number in check --- lapack-netlib/SRC/chetrd_hb2st.F | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F index 6645121c19..91806bb1d6 100644 --- a/lapack-netlib/SRC/chetrd_hb2st.F +++ b/lapack-netlib/SRC/chetrd_hb2st.F @@ -512,7 +512,7 @@ SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, * * Call the kernel * -#if defined(_OPENMP) && _OPENMP >= 201307L +#if defined(_OPENMP) && _OPENMP >= 201307 IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) From 1a8e487c4a88ef0759efc2d13b9ff3c825a7a57c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 May 2018 21:38:25 +0200 Subject: [PATCH 46/61] Drop C-style "L" suffix from OPENMP version number in check --- lapack-netlib/SRC/dsytrd_sb2st.F | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F index d1ccc1a892..4ca0507e48 100644 --- a/lapack-netlib/SRC/dsytrd_sb2st.F +++ b/lapack-netlib/SRC/dsytrd_sb2st.F @@ -481,7 +481,7 @@ SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, * * Call the kernel * -#if defined(_OPENMP) && _OPENMP >= 201307L +#if defined(_OPENMP) && _OPENMP >= 201307 IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) From 9795adc7efb176afb72103ddfd447f92c2579387 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 May 2018 21:39:42 +0200 Subject: [PATCH 47/61] Drop C-style "L" suffix from OPENMP version number in check --- lapack-netlib/SRC/zhetrd_hb2st.F | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F index 7b623481b5..508afca06e 100644 --- a/lapack-netlib/SRC/zhetrd_hb2st.F +++ b/lapack-netlib/SRC/zhetrd_hb2st.F @@ -512,7 +512,7 @@ SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, * * Call the kernel * -#if defined(_OPENMP) && _OPENMP >= 201307L +#if defined(_OPENMP) && _OPENMP >= 201307 IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) From 65b8a5c5d876c25bc1387c7228535e6c7d3147ab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 May 2018 21:47:10 +0200 Subject: [PATCH 48/61] Update compiler flag for openmp use with ICC The deprecated -openmp option was finally removed in favor of -qopenmp or -fopenmp, picking the latter to stay compatible with Intel compiler versions before 2015 (when -q options were introduced). Fixes #1546 --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 142cb420f8..3a3e9f5100 100644 --- a/Makefile.system +++ b/Makefile.system @@ -433,7 +433,7 @@ CCOMMON_OPT += -fopenmp endif ifeq ($(C_COMPILER), INTEL) -CCOMMON_OPT += -openmp +CCOMMON_OPT += -fopenmp endif ifeq ($(C_COMPILER), PGI) From d2b9389f1b49c8dee358f3e7211ac4ac707f0dd4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 May 2018 21:55:37 +0200 Subject: [PATCH 49/61] Fixes for ifort 2018 1. the already deprecated -openmp option was removed in 2018, switch to -fopenmp 2. add leading blank in search for "zho_ge__" symbol to work around misleading tags in the 2018 assembly Expected to fix #1548 --- f_check | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/f_check b/f_check index 941a9a5c4d..997e023931 100644 --- a/f_check +++ b/f_check @@ -97,7 +97,7 @@ if ($compiler eq "") { if ($data =~ /Intel/) { $vendor = INTEL; - $openmp = "-openmp"; + $openmp = "-fopenmp"; } if ($data =~ /Sun Fortran/) { @@ -127,7 +127,7 @@ if ($compiler eq "") { # for embeded underscore name, e.g. zho_ge, it may append 2 underscores. $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; - if ($data =~ /zho_ge__/) { + if ($data =~ / zho_ge__/) { $need2bu = 1; } } @@ -155,7 +155,7 @@ if ($compiler eq "") { if ($compiler =~ /ifort/) { $vendor = INTEL; $bu = "_"; - $openmp = "-openmp"; + $openmp = "-fopenmp"; } if ($compiler =~ /pathf/) { From 193f8356622c85bd494931e54f7efe379e296f88 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 May 2018 12:34:09 +0200 Subject: [PATCH 50/61] Change -openmp to -fopenmp for ifort entry as well --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 3a3e9f5100..1fe7d9d3d1 100644 --- a/Makefile.system +++ b/Makefile.system @@ -713,7 +713,7 @@ FCOMMON_OPT += -i8 endif endif ifeq ($(USE_OPENMP), 1) -FCOMMON_OPT += -openmp +FCOMMON_OPT += -fopenmp endif endif From d7d950fcf29c30f6611a247cf8fde4a518286b41 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 May 2018 13:15:42 +0200 Subject: [PATCH 51/61] LAPACKE fixes from lapack PR249 Copied from Reference-LAPACK/lapack#249, this fixes out-of-bounds memory accesses in the nancheck calls of the LAPACKE lacgv, lassq,larfg,larfb,larfx and mtr functions --- lapack-netlib/LAPACKE/src/lapacke_clacgv.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_clarfb.c | 41 ++++++++++++---------- lapack-netlib/LAPACKE/src/lapacke_clarfg.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_clarfx.c | 4 ++- lapack-netlib/LAPACKE/src/lapacke_classq.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_cunmtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_cupmtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dlarfb.c | 41 ++++++++++++---------- lapack-netlib/LAPACKE/src/lapacke_dlarfg.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dlarfx.c | 4 ++- lapack-netlib/LAPACKE/src/lapacke_dlassq.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dopmtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dormtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_slarfb.c | 41 ++++++++++++---------- lapack-netlib/LAPACKE/src/lapacke_slarfg.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_slarfx.c | 4 ++- lapack-netlib/LAPACKE/src/lapacke_slassq.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sopmtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sormtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zlacgv.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zlarfb.c | 41 ++++++++++++---------- lapack-netlib/LAPACKE/src/lapacke_zlarfg.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zlarfx.c | 4 ++- lapack-netlib/LAPACKE/src/lapacke_zlassq.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zunmtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zupmtr.c | 2 +- 26 files changed, 122 insertions(+), 94 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_clacgv.c b/lapack-netlib/LAPACKE/src/lapacke_clacgv.c index 0014906eda..9a77c8ec0e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clacgv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clacgv.c @@ -39,7 +39,7 @@ lapack_int LAPACKE_clacgv( lapack_int n, lapack_complex_float* x, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_c_nancheck( 1+(n-1)*ABS(incx), x, incx ) ) { + if( LAPACKE_c_nancheck( n, x, incx ) ) { return -2; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c index 18e24509d6..3aeb0d7e43 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c @@ -51,16 +51,21 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); + lapack_int lrv, lcv; /* row, column stride */ + if( matrix_layout == LAPACK_COL_MAJOR ) { + lrv = 1; + lcv = ldv; + } else { + lrv = ldv; + lcv = 1; + } + ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + + nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : + ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -13; } @@ -70,8 +75,8 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], - ldv ) ) + if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, + &v[k*lrv], ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > nrows_v ) { @@ -79,23 +84,23 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct return -8; } if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*ldv], ldv ) ) + &v[(nrows_v-k)*lrv], ldv ) ) return -9; if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], - ldv ) ) + if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, + &v[k*lrv], ldv ) ) return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { + } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > ncols_v ) { LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); return -8; } - if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], - ldv ) ) + if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, + &v[(ncols_v-k)*lcv], ldv ) ) return -9; if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) return -9; diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfg.c b/lapack-netlib/LAPACKE/src/lapacke_clarfg.c index 0381a42bc8..9e852a4065 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfg.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfg.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_clarfg( lapack_int n, lapack_complex_float* alpha, if( LAPACKE_c_nancheck( 1, alpha, 1 ) ) { return -2; } - if( LAPACKE_c_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_c_nancheck( n-1, x, incx ) ) { return -3; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfx.c b/lapack-netlib/LAPACKE/src/lapacke_clarfx.c index 977e283e15..786c214128 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfx.c @@ -38,6 +38,7 @@ lapack_int LAPACKE_clarfx( int matrix_layout, char side, lapack_int m, lapack_complex_float tau, lapack_complex_float* c, lapack_int ldc, lapack_complex_float* work ) { + lapack_int lv; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_clarfx", -1 ); return -1; @@ -51,7 +52,8 @@ lapack_int LAPACKE_clarfx( int matrix_layout, char side, lapack_int m, if( LAPACKE_c_nancheck( 1, &tau, 1 ) ) { return -6; } - if( LAPACKE_c_nancheck( m, v, 1 ) ) { + lv = (LAPACKE_lsame( side, 'l' ) ? m : n); + if( LAPACKE_c_nancheck( lv, v, 1 ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_classq.c b/lapack-netlib/LAPACKE/src/lapacke_classq.c index b8f231dbb8..e4d746c5a1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_classq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_classq.c @@ -39,7 +39,7 @@ lapack_int LAPACKE_classq( lapack_int n, lapack_complex_float* x, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ - if( LAPACKE_c_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_c_nancheck( n, x, incx ) ) { return -2; } if( LAPACKE_s_nancheck( 1, scale, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c index 1864c41217..d9fb2dca0f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_cunmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -10; } - if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) { return -9; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_cupmtr.c b/lapack-netlib/LAPACKE/src/lapacke_cupmtr.c index 51f6d82762..ba026ae68a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cupmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cupmtr.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_cupmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -9; } - if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c index 55c26f4b6b..a1f49dde15 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c @@ -50,16 +50,21 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); + lapack_int lrv, lcv; /* row, column stride */ + if( matrix_layout == LAPACK_COL_MAJOR ) { + lrv = 1; + lcv = ldv; + } else { + lrv = ldv; + lcv = 1; + } + ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + + nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : + ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -13; } @@ -69,8 +74,8 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], - ldv ) ) + if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, + &v[k*lrv], ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > nrows_v ) { @@ -78,23 +83,23 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct return -8; } if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*ldv], ldv ) ) + &v[(nrows_v-k)*lrv], ldv ) ) return -9; if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], - ldv ) ) + if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, + &v[k*lrv], ldv ) ) return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { + } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > ncols_v ) { LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); return -8; } - if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], - ldv ) ) + if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, + &v[(ncols_v-k)*lcv], ldv ) ) return -9; if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) return -9; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfg.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfg.c index 0f627b3233..df401c41d9 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfg.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfg.c @@ -42,7 +42,7 @@ lapack_int LAPACKE_dlarfg( lapack_int n, double* alpha, double* x, if( LAPACKE_d_nancheck( 1, alpha, 1 ) ) { return -2; } - if( LAPACKE_d_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_d_nancheck( n-1, x, incx ) ) { return -3; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfx.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfx.c index ab4a58e76b..7b7b7201ec 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfx.c @@ -37,6 +37,7 @@ lapack_int LAPACKE_dlarfx( int matrix_layout, char side, lapack_int m, lapack_int n, const double* v, double tau, double* c, lapack_int ldc, double* work ) { + lapack_int lv; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_dlarfx", -1 ); return -1; @@ -50,7 +51,8 @@ lapack_int LAPACKE_dlarfx( int matrix_layout, char side, lapack_int m, if( LAPACKE_d_nancheck( 1, &tau, 1 ) ) { return -6; } - if( LAPACKE_d_nancheck( m, v, 1 ) ) { + lv = (LAPACKE_lsame( side, 'l' ) ? m : n); + if( LAPACKE_d_nancheck( lv, v, 1 ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlassq.c b/lapack-netlib/LAPACKE/src/lapacke_dlassq.c index a564240d49..0e096b6d47 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlassq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlassq.c @@ -38,7 +38,7 @@ lapack_int LAPACKE_dlassq( lapack_int n, double* x, lapack_int incx, double* sca #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ - if( LAPACKE_d_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_d_nancheck( n, x, incx ) ) { return -2; } if( LAPACKE_d_nancheck( 1, scale, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dopmtr.c b/lapack-netlib/LAPACKE/src/lapacke_dopmtr.c index 93d3d3d303..7fbfb11fd7 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dopmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dopmtr.c @@ -56,7 +56,7 @@ lapack_int LAPACKE_dopmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -9; } - if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c index 05e4c57c89..db75a66094 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c @@ -57,7 +57,7 @@ lapack_int LAPACKE_dormtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -10; } - if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) { return -9; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c index 72fa75ef10..0ebdc931ad 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c @@ -50,16 +50,21 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); + lapack_int lrv, lcv; /* row, column stride */ + if( matrix_layout == LAPACK_COL_MAJOR ) { + lrv = 1; + lcv = ldv; + } else { + lrv = ldv; + lcv = 1; + } + ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + + nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : + ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -13; } @@ -69,8 +74,8 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], - ldv ) ) + if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, + &v[k*lrv], ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > nrows_v ) { @@ -78,23 +83,23 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct return -8; } if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*ldv], ldv ) ) + &v[(nrows_v-k)*lrv], ldv ) ) return -9; if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], - ldv ) ) + if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, + &v[k*lrv], ldv ) ) return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { + } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > ncols_v ) { LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); return -8; } - if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], - ldv ) ) + if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, + &v[(ncols_v-k)*lcv], ldv ) ) return -9; if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) return -9; diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfg.c b/lapack-netlib/LAPACKE/src/lapacke_slarfg.c index 2952773876..ea9a835754 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfg.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfg.c @@ -42,7 +42,7 @@ lapack_int LAPACKE_slarfg( lapack_int n, float* alpha, float* x, if( LAPACKE_s_nancheck( 1, alpha, 1 ) ) { return -2; } - if( LAPACKE_s_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_s_nancheck( n-1, x, incx ) ) { return -3; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfx.c b/lapack-netlib/LAPACKE/src/lapacke_slarfx.c index 426137815d..c2b797a98c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfx.c @@ -37,6 +37,7 @@ lapack_int LAPACKE_slarfx( int matrix_layout, char side, lapack_int m, lapack_int n, const float* v, float tau, float* c, lapack_int ldc, float* work ) { + lapack_int lv; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_slarfx", -1 ); return -1; @@ -50,7 +51,8 @@ lapack_int LAPACKE_slarfx( int matrix_layout, char side, lapack_int m, if( LAPACKE_s_nancheck( 1, &tau, 1 ) ) { return -6; } - if( LAPACKE_s_nancheck( m, v, 1 ) ) { + lv = (LAPACKE_lsame( side, 'l' ) ? m : n); + if( LAPACKE_s_nancheck( lv, v, 1 ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_slassq.c b/lapack-netlib/LAPACKE/src/lapacke_slassq.c index 668289e18e..3e265e3591 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slassq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slassq.c @@ -38,7 +38,7 @@ lapack_int LAPACKE_slassq( lapack_int n, float* x, lapack_int incx, float* scale #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ - if( LAPACKE_s_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_s_nancheck( n, x, incx ) ) { return -2; } if( LAPACKE_s_nancheck( 1, scale, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_sopmtr.c b/lapack-netlib/LAPACKE/src/lapacke_sopmtr.c index 333789837c..bf8eed4f9e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sopmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sopmtr.c @@ -56,7 +56,7 @@ lapack_int LAPACKE_sopmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -9; } - if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c index 5a9d441386..9f0e9fddf5 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c @@ -57,7 +57,7 @@ lapack_int LAPACKE_sormtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -10; } - if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) { return -9; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlacgv.c b/lapack-netlib/LAPACKE/src/lapacke_zlacgv.c index 3b1130ba5d..cd412dc24a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlacgv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlacgv.c @@ -39,7 +39,7 @@ lapack_int LAPACKE_zlacgv( lapack_int n, lapack_complex_double* x, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_z_nancheck( 1+(n-1)*ABS(incx), x, incx ) ) { + if( LAPACKE_z_nancheck( n, x, incx ) ) { return -2; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c index 6ea4960f3f..4fc2eb0ab1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c @@ -51,16 +51,21 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); + lapack_int lrv, lcv; /* row, column stride */ + if( matrix_layout == LAPACK_COL_MAJOR ) { + lrv = 1; + lcv = ldv; + } else { + lrv = ldv; + lcv = 1; + } + ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + + nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : + ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : + ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -13; } @@ -70,8 +75,8 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], - ldv ) ) + if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, + &v[k*lrv], ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > nrows_v ) { @@ -79,23 +84,23 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct return -8; } if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*ldv], ldv ) ) + &v[(nrows_v-k)*lrv], ldv ) ) return -9; if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) return -9; } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, &v[k], - ldv ) ) + if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, + &v[k*lrv], ldv ) ) return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { + } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { if( k > ncols_v ) { LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); return -8; } - if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, &v[ncols_v-k], - ldv ) ) + if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, + &v[(ncols_v-k)*lcv], ldv ) ) return -9; if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) return -9; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfg.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfg.c index 14e587fcc1..a566a08cb5 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfg.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfg.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_zlarfg( lapack_int n, lapack_complex_double* alpha, if( LAPACKE_z_nancheck( 1, alpha, 1 ) ) { return -2; } - if( LAPACKE_z_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_z_nancheck( n-1, x, incx ) ) { return -3; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfx.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfx.c index 1dd1f5204e..b4ebf727e8 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfx.c @@ -38,6 +38,7 @@ lapack_int LAPACKE_zlarfx( int matrix_layout, char side, lapack_int m, lapack_complex_double tau, lapack_complex_double* c, lapack_int ldc, lapack_complex_double* work ) { + lapack_int lv; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zlarfx", -1 ); return -1; @@ -51,7 +52,8 @@ lapack_int LAPACKE_zlarfx( int matrix_layout, char side, lapack_int m, if( LAPACKE_z_nancheck( 1, &tau, 1 ) ) { return -6; } - if( LAPACKE_z_nancheck( m, v, 1 ) ) { + lv = (LAPACKE_lsame( side, 'l' ) ? m : n); + if( LAPACKE_z_nancheck( lv, v, 1 ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlassq.c b/lapack-netlib/LAPACKE/src/lapacke_zlassq.c index a218c9b62d..b8972b9749 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlassq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlassq.c @@ -39,7 +39,7 @@ lapack_int LAPACKE_zlassq( lapack_int n, lapack_complex_double* x, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input vector `x` and in/out scalars `scale` and `sumsq` for NaNs */ - if( LAPACKE_z_nancheck( 1+(n-2)*ABS(incx), x, incx ) ) { + if( LAPACKE_z_nancheck( n, x, incx ) ) { return -2; } if( LAPACKE_d_nancheck( 1, scale, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zunmtr.c b/lapack-netlib/LAPACKE/src/lapacke_zunmtr.c index f8936cd5ad..4333854409 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zunmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zunmtr.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_zunmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -10; } - if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) { return -9; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zupmtr.c b/lapack-netlib/LAPACKE/src/lapacke_zupmtr.c index d735c5561e..80bbd95295 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zupmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zupmtr.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_zupmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { return -9; } - if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) { + if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) { return -8; } } From 3716267124854b4da70b6ddadea5084c66971648 Mon Sep 17 00:00:00 2001 From: Zhiyong Dang Date: Fri, 11 May 2018 12:15:08 +0800 Subject: [PATCH 52/61] Change _STDC_VERSION__ to __STDC_VERSION__ Change-Id: Id3fa4e8d9eedd4ef7230df69b611e7f397301a42 --- driver/level3/level3_gemm3m_thread.c | 2 +- driver/level3/level3_syrk_threaded.c | 2 +- driver/level3/level3_thread.c | 2 +- driver/others/blas_server_omp.c | 8 ++++---- lapack/getrf/getrf_parallel.c | 10 +++++----- lapack/getrf/potrf_parallel.c | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index f5e5bca1e1..4903aa5bd1 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -91,7 +91,7 @@ #endif typedef struct { -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic #else volatile diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index d1c476f003..574f825b0f 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -67,7 +67,7 @@ #endif typedef struct { -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic #else volatile diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 47b20f7fa6..4ab1ee8cc0 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -91,7 +91,7 @@ #endif typedef struct { -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic #else volatile diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 868db3b1d3..cc00092cd6 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -36,7 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L #ifndef _Atomic #define _Atomic volatile #endif @@ -57,7 +57,7 @@ int blas_server_avail = 0; static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; #else static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; @@ -322,7 +322,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ while(true) { for(i=0; i < MAX_PARALLEL_NUMBER; i++) { -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Bool inuse = false; if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) { #else @@ -347,7 +347,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ exec_threads(&queue[i], buf_index); } -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L atomic_store(&blas_buffer_inuse[buf_index], false); #else blas_buffer_inuse[buf_index] = false; diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index 4e742b9940..591ce4a99f 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -119,7 +119,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE; FLOAT *sbb = sb; -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; #else volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; @@ -201,7 +201,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra /* Non blocking implementation */ typedef struct { -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic #else volatile @@ -246,7 +246,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * blasint *ipiv = (blasint *)args -> c; BLASLONG jw; -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; #else volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; @@ -452,7 +452,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, #ifdef _MSC_VER BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE]; #else -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic #else volatile @@ -728,7 +728,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, BLASLONG range[MAX_CPU_NUMBER + 1]; BLASLONG width, nn, num_cpu; -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic #else volatile diff --git a/lapack/getrf/potrf_parallel.c b/lapack/getrf/potrf_parallel.c index 104022dd9d..c2fee6bd1a 100644 --- a/lapack/getrf/potrf_parallel.c +++ b/lapack/getrf/potrf_parallel.c @@ -101,7 +101,7 @@ static FLOAT dm1 = -1.; #endif typedef struct { -#if _STDC_VERSION__ >= 201112L +#if __STDC_VERSION__ >= 201112L _Atomic #else volatile From 53457f222fa6b553be313324911ea9f5c97e1db7 Mon Sep 17 00:00:00 2001 From: "zhiyong.dang" Date: Fri, 11 May 2018 00:13:16 -0700 Subject: [PATCH 53/61] move _Atomic define to common.h --- common.h | 6 ++++++ driver/others/blas_server_omp.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/common.h b/common.h index 86c33b2fd2..123e3dee78 100644 --- a/common.h +++ b/common.h @@ -649,6 +649,12 @@ int omp_get_num_procs(void); __declspec(dllimport) int __cdecl omp_in_parallel(void); __declspec(dllimport) int __cdecl omp_get_num_procs(void); #endif +#if (__STDC_VERSION__ >= 201112L) +#ifndef _Atomic +#define _Atomic volatile +#endif +#include +#endif #else #ifdef __ELF__ int omp_in_parallel (void) __attribute__ ((weak)); diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index cc00092cd6..fccdb43205 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -36,12 +36,6 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#if __STDC_VERSION__ >= 201112L -#ifndef _Atomic -#define _Atomic volatile -#endif -#include -#endif #include #include #include From 41ae8e8d677cd85ddd15ad5be723bafcd88a9d62 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 May 2018 12:11:38 +0200 Subject: [PATCH 54/61] Add threading and OpenMP information to output For #1416 and #1529, more information about the options OpenBLAS was built with is needed. Additionally we may want to add this data to the openblas.pc file (but not all projects use pkgconfig, and as far as I am aware the cmake module for accessing it does not make such "private" declarations available) --- driver/others/openblas_get_config.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 7d041b9077..87a27712f5 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -54,6 +54,9 @@ static char* openblas_config_str="" #ifdef NO_AFFINITY "NO_AFFINITY " #endif +#ifdef USE_OPENMP + "USE_OPENMP " +#endif #ifndef DYNAMIC_ARCH CHAR_CORENAME #endif @@ -61,18 +64,23 @@ static char* openblas_config_str="" #ifdef DYNAMIC_ARCH char *gotoblas_corename(); -static char tmp_config_str[256]; #endif +static char tmp_config_str[256]; +int openblas_get_parallel(); char* CNAME() { -#ifndef DYNAMIC_ARCH - return openblas_config_str; -#else +char tmpstr[20]; strcpy(tmp_config_str, openblas_config_str); +#ifdef DYNAMIC_ARCH strcat(tmp_config_str, gotoblas_corename()); - return tmp_config_str; #endif +if (openblas_get_parallel() == 0) + sprintf(tmpstr, " SINGLE_THREADED"); +else + snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER); + strcat(tmp_config_str, tmpstr); + return tmp_config_str; } @@ -83,3 +91,4 @@ char* openblas_get_corename() { return gotoblas_corename(); #endif } + From a07843bc938b0595f2a453e9ae276c5a781504f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 May 2018 22:11:27 +0200 Subject: [PATCH 55/61] Overwrite any pre-existing openblas.pc rather than append to it --- Makefile.install | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.install b/Makefile.install index 9ce5ceae6e..6176fb0ff1 100644 --- a/Makefile.install +++ b/Makefile.install @@ -96,7 +96,7 @@ endif #Generating openblas.pc @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" - @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" From 7d7564568cb6641e78b17e2d31c29a2162ce6db1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 May 2018 00:09:35 +0200 Subject: [PATCH 56/61] Add build-time configuration options to pkgconfig file --- cmake/openblas.pc.in | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index 113ba85262..35973b09bd 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -1,6 +1,7 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ +openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ From eb9b021d3890429a41823dc3d90eb0d11c0a6d6d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 May 2018 00:10:15 +0200 Subject: [PATCH 57/61] Add build-time configuration options to pkgconfig file --- Makefile.install | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.install b/Makefile.install index 9ce5ceae6e..21c3c9e227 100644 --- a/Makefile.install +++ b/Makefile.install @@ -96,8 +96,9 @@ endif #Generating openblas.pc @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" - @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" From 893b535540bb71ad766ca7d56d819630e91a8715 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 15 May 2018 14:42:12 +0200 Subject: [PATCH 58/61] Use correct data type for initializers of v2f64, v4f32 Fixes #1561 --- kernel/mips/dgemv_n_msa.c | 4 ++-- kernel/mips/sgemv_n_msa.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/mips/dgemv_n_msa.c b/kernel/mips/dgemv_n_msa.c index 82c3a96cfe..380b94d066 100644 --- a/kernel/mips/dgemv_n_msa.c +++ b/kernel/mips/dgemv_n_msa.c @@ -484,10 +484,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; v2f64 v_alpha; - v2f64 x0, x1, x2, x3, y0 = 0.0, y1 = 0.0, y2 = 0.0, y3 = 0.0; + v2f64 x0, x1, x2, x3, y0 = {0,0}, y1 = {0,0}, y2 = {0,0}, y3 = {0,0}; v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; - v2f64 t30, t31, tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0; + v2f64 t30, t31, tp0 = {0,0}, tp1 = {0,0}, tp2 = {0,0}, tp3 = {0,0}, tp4 = {0,0}, tp5 = {0,0}, tp6 = {0,0}, tp7 = {0,0}; v_alpha = COPY_DOUBLE_TO_VECTOR(alpha); diff --git a/kernel/mips/sgemv_n_msa.c b/kernel/mips/sgemv_n_msa.c index e1ecb54739..66e3adebfc 100644 --- a/kernel/mips/sgemv_n_msa.c +++ b/kernel/mips/sgemv_n_msa.c @@ -423,9 +423,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, FLOAT *y_org = y; FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; - v4f32 v_alpha, x0, x1, y0 = 0.0, y1 = 0.0; + v4f32 v_alpha, x0, x1, y0 = {0,0,0,0}, y1 = {0,0,0,0}; v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; - v4f32 tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0; + v4f32 tp0 = {0,0,0,0}, tp1 = {0,0,0,0}, tp2 = {0,0,0,0}, tp3 = {0,0,0,0}, tp4 = {0,0,0,0}, tp5 = {0,0,0,0}, tp6 = {0,0,0,0}, tp7 = {0,0,0,0}; v_alpha = COPY_FLOAT_TO_VECTOR(alpha); From 7a7619af6df1fc7754cd30ff8310e1c24bcee7bb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 May 2018 11:40:08 +0200 Subject: [PATCH 59/61] Revert changes from PR#1419 at least one of these changes apparently is an oversimplification, leading to TRMM breakage on some platforms as observed in #1563 --- kernel/generic/trmm_ltcopy_2.c | 32 +++++++++++----- kernel/generic/trmm_utcopy_16.c | 67 ++++++++++++++++++--------------- kernel/generic/trmm_utcopy_2.c | 37 +++++++++++------- kernel/generic/trmm_utcopy_4.c | 41 +++++++++++--------- kernel/generic/trsm_ltcopy_4.c | 2 +- kernel/generic/ztrmm_ltcopy_2.c | 46 ++++++++++++++++++---- kernel/generic/ztrsm_utcopy_1.c | 2 +- kernel/generic/ztrsm_utcopy_2.c | 4 +- 8 files changed, 147 insertions(+), 84 deletions(-) diff --git a/kernel/generic/trmm_ltcopy_2.c b/kernel/generic/trmm_ltcopy_2.c index 60cdeed1c3..e9ad45fa0d 100644 --- a/kernel/generic/trmm_ltcopy_2.c +++ b/kernel/generic/trmm_ltcopy_2.c @@ -116,22 +116,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (m & 1) { if (X > posY) { - /* ao1 += 1; - ao2 += 1; */ + ao1 += 1; + ao2 += 1; b += 2; } else -#ifdef UNIT if (X < posY) { -#endif - b[ 0] = *(ao1 + 0); -#ifdef UNIT + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; } else { +#ifdef UNIT + data02 = *(ao1 + 1); b[ 0] = ONE; + b[ 1] = data02; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + ao1 += 2; + b += 2; } -#endif - b[ 1] = *(ao1 + 1); - b += 2; } posY += 2; @@ -178,7 +190,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } while (i > 0); } - // posY += 1; + posY += 1; } return 0; diff --git a/kernel/generic/trmm_utcopy_16.c b/kernel/generic/trmm_utcopy_16.c index 12642e7db6..b83989f55d 100644 --- a/kernel/generic/trmm_utcopy_16.c +++ b/kernel/generic/trmm_utcopy_16.c @@ -518,7 +518,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 15); if (i > 0) { if (X < posY) { - /* a01 += i; + a01 += i; a02 += i; a03 += i; a04 += i; @@ -533,7 +533,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a13 += i; a14 += i; a15 += i; - a16 += i; */ + a16 += i; b += 16 * i; } else if (X > posY) { @@ -1130,14 +1130,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 7); if (i > 0) { if (X < posY) { - /* a01 += i; + a01 += i; a02 += i; a03 += i; a04 += i; a05 += i; a06 += i; a07 += i; - a08 += i; */ + a08 += i; b += 8 * i; } else if (X > posY) { @@ -1156,13 +1156,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 8; } - /* a02 += i * lda; + a02 += i * lda; a03 += i * lda; a04 += i * lda; a05 += i * lda; a06 += i * lda; a07 += i * lda; - a08 += i * lda; */ + a08 += i * lda; } else { #ifdef UNIT b[ 0] = ONE; @@ -1371,10 +1371,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i > 0) { if (X < posY) { - /* a01 += i; + a01 += i; a02 += i; a03 += i; - a04 += i; */ + a04 += i; b += 4 * i; } else if (X > posY) { @@ -1387,9 +1387,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a01 += lda; b += 4; } - /* a02 += lda; + a02 += lda; a03 += lda; - a04 += lda; */ + a04 += lda; } else { #ifdef UNIT @@ -1487,19 +1487,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (X < posY) { a01 ++; a02 ++; - } else { -#ifdef UNIT + b += 2; + } else if (X > posY) { -#endif b[ 0] = *(a01 + 0); -#ifdef UNIT + b[ 1] = *(a01 + 1); + a01 += lda; + b += 2; } else { +#ifdef UNIT b[ 0] = ONE; - } + b[ 1] = *(a01 + 1); +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); #endif - b[ 1] = *(a01 + 1); - } - b += 2; + b += 2; + } } posY += 2; } @@ -1518,25 +1522,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (i > 0) { do { if (X < posY) { - a01 ++; - } else { -#ifdef UNIT + a01 += 1; + b ++; + } else if (X > posY) { -#endif b[ 0] = *(a01 + 0); -#ifdef UNIT + a01 += lda; + b ++; } else { +#ifdef UNIT b[ 0] = ONE; - } +#else + b[ 0] = *(a01 + 0); #endif - a01 += lda; - } - b ++; - X ++; - i --; + a01 += lda; + b ++; + } + + X += 1; + i --; } while (i > 0); } - // posY += 1; + posY += 1; } return 0; diff --git a/kernel/generic/trmm_utcopy_2.c b/kernel/generic/trmm_utcopy_2.c index 75076c3822..ae4a19e327 100644 --- a/kernel/generic/trmm_utcopy_2.c +++ b/kernel/generic/trmm_utcopy_2.c @@ -117,8 +117,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (m & 1) { if (X < posY) { - /* ao1 += 1; - ao2 += 1; */ + ao1 += 1; + ao2 += 1; b += 2; } else if (X > posY) { @@ -127,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = data01; b[ 1] = data02; - // ao1 += lda; + ao1 += lda; b += 2; } else { #ifdef UNIT @@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = data01; b[ 1] = ZERO; #endif - // ao1 += lda; + ao1 += lda; b += 2; } } @@ -161,18 +161,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = m; if (m > 0) { do { + if (X < posY) { + b += 1; + ao1 += 1; + } else + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + b += 1; + ao1 += lda; + } else { #ifdef UNIT - if (X > posY) { -#endif - b[ 0] = *(ao1 + 0); -#ifdef UNIT - } else { - b[ 0] = ONE; - } + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; #endif - b ++; - ao1 += lda; - X ++; + b += 1; + ao1 += lda; + } + + X += 1; i --; } while (i > 0); } diff --git a/kernel/generic/trmm_utcopy_4.c b/kernel/generic/trmm_utcopy_4.c index e5844094e5..441f7338b5 100644 --- a/kernel/generic/trmm_utcopy_4.c +++ b/kernel/generic/trmm_utcopy_4.c @@ -201,18 +201,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (X < posY) { if (m & 2) { - /* ao1 += 2; + ao1 += 2; ao2 += 2; ao3 += 2; - ao4 += 2; */ + ao4 += 2; b += 8; } if (m & 1) { - /* ao1 += 1; + ao1 += 1; ao2 += 1; ao3 += 1; - ao4 += 1; */ + ao4 += 1; b += 4; } @@ -238,7 +238,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = data08; ao1 += 2 * lda; - // ao2 += 2 * lda; + ao2 += 2 * lda; b += 8; } @@ -253,7 +253,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 2] = data03; b[ 3] = data04; - // ao1 += lda; + ao1 += lda; b += 4; } @@ -401,7 +401,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (i) { if (X < posY) { - // ao1 += 2; + ao1 += 2; b += 2; } else if (X > posY) { @@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = data01; b[ 1] = data02; - // ao1 += lda; + ao1 += lda; b += 2; } else { #ifdef UNIT @@ -443,21 +443,26 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON do { if (X < posY) { + b += 1; ao1 += 1; - } else { -#ifdef UNIT + } else if (X > posY) { -#endif - b[ 0] = *(ao1 + 0); -#ifdef UNIT + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; } else { +#ifdef UNIT b[ 0] = ONE; - } +#else + data01 = *(ao1 + 0); + b[ 0] = data01; #endif - ao1 += lda; - } - b ++; - X ++; + ao1 += lda; + b += 1; + } + + X += 1; i --; } while (i > 0); } diff --git a/kernel/generic/trsm_ltcopy_4.c b/kernel/generic/trsm_ltcopy_4.c index 07bb137d4a..12043eb335 100644 --- a/kernel/generic/trsm_ltcopy_4.c +++ b/kernel/generic/trsm_ltcopy_4.c @@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT } a1 += 2 * lda; - // a2 += 2 * lda; + a2 += 2 * lda; b += 8; ii += 2; diff --git a/kernel/generic/ztrmm_ltcopy_2.c b/kernel/generic/ztrmm_ltcopy_2.c index 7969f4f3db..457890ceb1 100644 --- a/kernel/generic/ztrmm_ltcopy_2.c +++ b/kernel/generic/ztrmm_ltcopy_2.c @@ -139,18 +139,48 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { -#ifdef UNIT + + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else if (X < posY) { -#endif - b[ 0] = *(ao1 + 0); - b[ 1] = *(ao1 + 1); -#ifdef UNIT + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; + + ao1 += lda; + b += 4; } else { +#ifdef UNIT + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + b[ 0] = ONE; b[ 1] = ZERO; - } + b[ 2] = data3; + b[ 3] = data4; +#else + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; #endif - b += 4; + b += 4; + } } posY += 2; @@ -203,7 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } while (i > 0); } - // posY += 1; + posY += 1; } return 0; diff --git a/kernel/generic/ztrsm_utcopy_1.c b/kernel/generic/ztrsm_utcopy_1.c index 0e33a7d181..08f85e8911 100644 --- a/kernel/generic/ztrsm_utcopy_1.c +++ b/kernel/generic/ztrsm_utcopy_1.c @@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01 = 0.0, data02 = 0.0; + FLOAT data01, data02; FLOAT *a1; lda *= 2; diff --git a/kernel/generic/ztrsm_utcopy_2.c b/kernel/generic/ztrsm_utcopy_2.c index c34d741ee8..387bb2532a 100644 --- a/kernel/generic/ztrsm_utcopy_2.c +++ b/kernel/generic/ztrsm_utcopy_2.c @@ -43,8 +43,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01 = 0.0, data02 = 0.0, data03, data04; - FLOAT data05, data06, data07 = 0.0, data08 = 0.0; + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; FLOAT *a1, *a2; lda *= 2; From 95f7f0229cf277d111206ae6841769d578e45580 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 May 2018 18:43:59 +0200 Subject: [PATCH 60/61] Remove extraneous brace from previous commit --- kernel/mips/dot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/mips/dot.c b/kernel/mips/dot.c index cbd3efc642..89c9f80f6b 100644 --- a/kernel/mips/dot.c +++ b/kernel/mips/dot.c @@ -42,7 +42,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) while(i < n) { #if defined(DSDOT) - dot += (double)(y[iy] * (double)x[ix] ; + dot += (double)y[iy] * (double)x[ix] ; #else dot += y[iy] * x[ix]; #endif From 82012b960b1c9427957bd87cc53f860823eeb674 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 May 2018 20:30:03 +0200 Subject: [PATCH 61/61] Revert " Switch mips32 target to USE_TRMM to fix complex TRMM" ... as it was just a silly workaround for the issue seen in #1563, caused by #1419 --- kernel/Makefile.L3 | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 4d2999b678..0664263967 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -20,10 +20,6 @@ ifeq ($(ARCH), arm64) USE_TRMM = 1 endif -ifeq ($(ARCH), mips) -USE_TRMM = 1 -endif - ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif