forked from xiaoyi018/simple_gemm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbenchmark.c
158 lines (130 loc) · 4.92 KB
/
benchmark.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#include <stdlib.h> // For: exit, drand48, malloc, free, NULL, EXIT_FAILURE
#include <stdio.h> // For: perror
#include <string.h> // For: memset
#include <float.h> // For: DBL_EPSILON
#include <math.h> // For: fabs
#include <omp.h>
#ifdef GETTIMEOFDAY
#include <sys/time.h> // For struct timeval, gettimeofday
#else
#include <time.h> // For struct timespec, clock_gettime, CLOCK_MONOTONIC
#endif
/* reference_gemm wraps a call to the BLAS-3 routine GEMM, via the standard FORTRAN interface - hence the reference semantics. */
#define GEMM sgemm_
extern void GEMM (char*, char*, int*, int*, int*, float*, float*, int*, float*, int*, float*, float*, int*);
void reference_gemm (int N, float ALPHA, float* A, float* B, float* C)
{
char TRANSA = 'N';
char TRANSB = 'N';
int M = N;
int K = N;
float BETA = 1.;
int LDA = N;
int LDB = N;
int LDC = N;
GEMM(&TRANSA, &TRANSB, &M, &N, &K, &ALPHA, A, &LDA, B, &LDB, &BETA, C, &LDC);
}
/* Your function must have the following signature: */
extern const char* gemm_desc;
extern void square_gemm (int, float*, float*, float*);
double wall_time ()
{
#ifdef GETTIMEOFDAY
struct timeval t;
gettimeofday (&t, NULL);
return 1.*t.tv_sec + 1.e-6*t.tv_usec;
#else
struct timespec t;
clock_gettime (CLOCK_MONOTONIC, &t);
return 1.*t.tv_sec + 1.e-9*t.tv_nsec;
#endif
}
void die (const char* message)
{
perror (message);
exit (EXIT_FAILURE);
}
void fill (float* p, int n)
{
for (int i = 0; i < n; ++i)
p[i] = 2 * drand48() - 1; // Uniformly distributed over [-1, 1]
}
void absolute_value (float *p, int n)
{
for (int i = 0; i < n; ++i)
p[i] = fabs (p[i]);
}
/* The benchmarking program */
int main (int argc, char **argv)
{
/* Test sizes should highlight performance dips at multiples of certain powers-of-two */
int test_sizes[] =
/* Multiples-of-32, +/- 1. for final benchmarking. */
{31,32,33,63,64,65,95,96,97,127,128,129,159,160,161,191,192,193,223,224,225,255,256,257,287,288,289,319,320,321,351,352,353,383,384,385,415,416,417,447,448,449,479,480,481,511,512,513,543,544,545,575,576,577,607,608,609,639,640,641,671,672,673,703,704,705,735,736,737,767,768,769,799,800,801,831,832,833,863,864,865,895,896,897,927,928,929,959,960,961,991,992,993,1023,1024,1025};
/* A representative subset of the first list for initial test. Currently uncommented. */
// { 31, 32, 96, 97, 127, 128, 129, 191, 192, 229, 255, 256, 257,
// 319, 320, 321, 417, 479, 480, 511, 512, 639, 640, 767, 768, 769 };
int nsizes = sizeof(test_sizes)/sizeof(test_sizes[0]);
/* assume last size is also the largest size */
int nmax = test_sizes[nsizes-1];
/* allocate memory for all problems */
float* buf = NULL;
buf = (float*) malloc (3 * nmax * nmax * sizeof(float));
if (buf == NULL) die ("failed to allocate largest problem size");
/* For each test size */
double res = 0.0;
double count = 0.0;
for (int isize = 0; isize < sizeof(test_sizes)/sizeof(test_sizes[0]); ++isize)
{
/* Create and fill 3 random matrices A,B,C*/
int n = test_sizes[isize];
float* A = buf + 0;
float* B = A + nmax*nmax;
float* C = B + nmax*nmax;
fill (A, n*n);
fill (B, n*n);
fill (C, n*n);
/* Measure performance (in Gflops/s). */
/* Time a "sufficiently long" sequence of calls to reduce noise */
double Gflops_s, seconds = -1.0;
double timeout = 0.1; // "sufficiently long" := at least 1/10 second.
int n_iterations = 0;
for (n_iterations = 1; seconds < timeout; n_iterations *= 2)
{
/* Warm-up */
square_gemm (n, A, B, C);
/* Benchmark n_iterations runs of square_gemm */
seconds = -wall_time();
for (int it = 0; it < n_iterations; ++it)
square_gemm (n, A, B, C);
seconds += wall_time();
/* compute Mflop/s rate */
Gflops_s = 2.e-9 * n_iterations * n * n * n / seconds;
}
printf ("Size: %d\tGflop/s: %.3g (%d iter, %.3f seconds)\n", n, Gflops_s, n_iterations, seconds);
res += Gflops_s;
count += 1;
/* Ensure that error does not exceed the theoretical error bound. */
/* C := A * B, computed with square_gemm */
memset (C, 0, n * n * sizeof(float));
square_gemm (n, A, B, C);
/* Do not explicitly check that A and B were unmodified on square_gemm exit
* - if they were, the following will most likely detect it:
* C := C - A * B, computed with reference_gemm */
reference_gemm(n, -1., A, B, C);
/* A := |A|, B := |B|, C := |C| */
absolute_value (A, n * n);
absolute_value (B, n * n);
absolute_value (C, n * n);
/* C := |C| - 3 * e_mach * n * |A| * |B|, computed with reference_gemm */
reference_gemm (n, -3.*FLT_EPSILON*n, A, B, C);
/* If any element in C is positive, then something went wrong in square_gemm */
for (int i = 0; i < n * n; ++i)
if (C[i] > 0)
die("*** FAILURE *** Error in matrix multiply exceeds componentwise error bounds.\n" );
}
res /= count;
printf("Average %lf \n",res);
free (buf);
return 0;
}