-
Notifications
You must be signed in to change notification settings - Fork 355
Optimization2
Jianyu Huang edited this page Aug 11, 2016
·
5 revisions
Copy the contents of file MMult1.c
into a file named MMult2.c
and change the contents:
from
/* Column-major accessors: element (i,j) of a matrix sits at offset
   j*ld + i from the start of its array, where ld is the leading dimension */

#define A(i,j) a[ (j)*lda + (i) ]
#define B(i,j) b[ (j)*ldb + (i) ]
#define C(i,j) c[ (j)*ldc + (i) ]

/* Inner-product helper used by MY_MMult; defined further down */

void AddDot( int k, double *x, int incx, double *y, double *gamma );

/* Routine for computing C = A * B + C, where A is m x k, B is k x n and
   C is m x n.  Every entry of C is updated by one call to AddDot. */

void MY_MMult( int m, int n, int k, double *a, int lda,
                                    double *b, int ldb,
                                    double *c, int ldc )
{
  int row, col;

  for ( col = 0; col < n; col++ ) {        /* Walk the columns of C */
    /* Column col of B is contiguous in memory, so one pointer to its
       top element serves every row of this column of C */
    double *b_col = &B( 0, col );

    for ( row = 0; row < m; row++ ) {      /* Walk the rows of C */
      /* C( row,col ) += (row-th row of A) . (col-th column of B);
         consecutive elements of a row of A are lda apart */
      AddDot( k, &A( row, 0 ), lda, b_col, &C( row, col ) );
    }
  }
}
/* X( i ) names the ith logical element of the strided vector x */

#define X(i) x[ (i)*incx ]

void AddDot( int k, double *x, int incx, double *y, double *gamma )
{
  /* Compute gamma := x' * y + gamma for vectors x and y of length k.
     Successive elements of x are incx apart in memory; successive
     elements of y are adjacent (stride 1).  The result is accumulated
     directly into *gamma, one product per iteration. */
  int idx = 0;

  while ( idx < k ) {
    *gamma += X( idx ) * y[ idx ];
    idx++;
  }
}
to
/* Create macros so that the matrices are stored in column-major order:
   element (i,j) sits at offset j*ld + i, where ld is the leading dimension */

#define A(i,j) a[ (j)*lda + (i) ]
#define B(i,j) b[ (j)*ldb + (i) ]
#define C(i,j) c[ (j)*ldc + (i) ]

/* Inner-product helper used by MY_MMult; defined further down */

void AddDot( int k, double *x, int incx, double *y, double *gamma );

/* Routine for computing C = A * B + C, where A is m x k, B is k x n and
   C is m x n.  The loop over the columns of C is unrolled by 4; when n is
   not a multiple of 4 the leftover columns are processed one at a time so
   that no column beyond n-1 is ever touched. */

void MY_MMult( int m, int n, int k, double *a, int lda,
                                    double *b, int ldb,
                                    double *c, int ldc )
{
  int i, j;
  int n4 = n - n % 4;   /* Number of columns covered by full groups of 4 */

  for ( j=0; j<n4; j+=4 ){       /* Loop over the columns of C, unrolled by 4 */
    for ( i=0; i<m; i+=1 ){      /* Loop over the rows of C */
      /* Update the C( i,j ) with the inner product of the ith row of A
         and the jth column of B */

      AddDot( k, &A( i,0 ), lda, &B( 0,j ), &C( i,j ) );

      /* Update the C( i,j+1 ) with the inner product of the ith row of A
         and the (j+1)th column of B */

      AddDot( k, &A( i,0 ), lda, &B( 0,j+1 ), &C( i,j+1 ) );

      /* Update the C( i,j+2 ) with the inner product of the ith row of A
         and the (j+2)th column of B */

      AddDot( k, &A( i,0 ), lda, &B( 0,j+2 ), &C( i,j+2 ) );

      /* Update the C( i,j+3 ) with the inner product of the ith row of A
         and the (j+3)th column of B */

      AddDot( k, &A( i,0 ), lda, &B( 0,j+3 ), &C( i,j+3 ) );
    }
  }

  for ( ; j<n; j+=1 ){           /* Remaining (at most 3) columns of C */
    for ( i=0; i<m; i+=1 ){      /* Loop over the rows of C */
      /* Update the C( i,j ) with the inner product of the ith row of A
         and the jth column of B */

      AddDot( k, &A( i,0 ), lda, &B( 0,j ), &C( i,j ) );
    }
  }
}
/* X( i ) names the ith logical element of the strided vector x */

#define X(i) x[ (i)*incx ]

void AddDot( int k, double *x, int incx, double *y, double *gamma )
{
  /* Compute gamma := x' * y + gamma for vectors x and y of length k.
     Successive elements of x are incx apart in memory; successive
     elements of y are adjacent (stride 1).  The result is accumulated
     directly into *gamma, one product per iteration. */
  int idx = 0;

  while ( idx < k ) {
    *gamma += X( idx ) * y[ idx ];
    idx++;
  }
}
Change the first lines in the makefile
to
OLD := MMult1
NEW := MMult2
make run
octave:3> PlotAll % this will create the plot
This time the performance graph will look something like
Still no performance benefit. What we are doing is gradually transforming the code into a form from which a performance benefit can be obtained in later steps.