QUARK  0.9.0
QUARK-QUeuingAndRuntimeforKernels
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
matmul_example.c
Go to the documentation of this file.
1 /* This example shows how to setup a very simple QUARK program. The
2  * dependencies involved in matrix multiplication are minimal, however
3  * you can see how QUARK can easily enable parallelism for a tile
4  * based code.
5  *
6  * In this example the calls to the tile based matmul routine are
7  * replaced with calls to a routine that inserts tasks into the QUARK
8  * runtime. However, the overall tile based algorithm does not
9  * change.
10  *
11  *
12  * matmul ( &Ablk[NB*NB*i + NB*NB*BB*k], &Bblk[NB*NB*k + NB*NB*BB*j],
13  * &Cblk[NB*NB*i + NB*NB*BB*j], NB);
14  *
15  * matmul_quark_call ( quark, &Ablk[NB*NB*i + NB*NB*BB*k],
16  * &Bblk[NB*NB*k + NB*NB*BB*j], &Cblk[NB*NB*i + NB*NB*BB*j], NB);
17  *
18  * Using this runtime, speedups for multiple threads can easily be
19  * enabled.
20 */
21 
22 
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <assert.h>
26 #include <math.h>
27 #include <sys/time.h>
28 #include <string.h>
29 
30 #include "quark.h"
31 
32 /* This is the simple three loops version of matmul */
33 void matmul ( double *A, double *B, double *C, int NB )
34 {
35  int i, j, k;
36  double aij;
37 
38  for (j=0; j<NB; j++)
39  for ( k=0; k<NB; k++)
40  for ( i=0; i<NB; i++)
41  C[i+j*NB] += A[i+k*NB] * B[k+j*NB];
42 }
43 
44 /* Create a task wrapper to matmul, usable by the QUARK runtime.
45  * Basically, you need to unpack the arguments from QUARK and call the
46  * routine */
47 void matmul_quark_task( Quark *quark )
48 {
49  double *A, *B, *C;
50  int NB;
51  quark_unpack_args_4( quark, A, B, C, NB );
52  matmul( A, B, C, NB );
53 }
54 
55 
56 /* Create a call that will insert a matmul task into the QUARK
57  * runtime. Later, when dependencies are statisfied, the runtime will
58  * execute this task. The arguments to matmul are specified and
59  * passed to QUARK here. */
60 void matmul_quark_call( Quark *quark, double *A, double *B, double *C, int NB )
61 {
63  sizeof(double)*NB*NB, A, INPUT,
64  sizeof(double)*NB*NB, B, INPUT,
65  sizeof(double)*NB*NB, C, INOUT,
66  sizeof(int), &NB, VALUE,
67  0 );
68 }
69 
70 /* Print a matrix */
71 void matrix_print( char *label, double *A, int N )
72 {
73  int i, j;
74  printf("%s (intial 4x4 submatrix)\n", label);
75  for (i=0; i<N && i<4; i++) {
76  for (j=0; j<N && j<4; j++) {
77  printf("%5.2f ", A[i + j*N] );
78  }
79  printf("\n");
80  }
81 }
82 
83 /* Compare two matricies */
84 int matrix_compare( double *A, double *A2, int N )
85 {
86  int i, j, nerr=0;
87  double diff;
88  for (i=0; i<N; i++) {
89  for (j=0; j<N; j++) {
90  diff = fabs( A[i*N+j] - A2[i*N+j] );
91  if ( diff > 0.0001 ) {
92  //printf("Error: [%d, %d] %5.1f != %5.1f (%f) \n", i, j, A[i*N + j], A2[i*N + j], diff );
93  nerr++;
94  }
95  }
96  }
97  return nerr;
98 }
99 
100 /* Subtract two gettimeofday timevals */
101 double timeval_subtract (struct timeval *result, struct timeval *x, struct timeval *y )
102 {
103  /* Perform the carry for the later subtraction by updating y. */
104  if (x->tv_usec < y->tv_usec) {
105  int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1;
106  y->tv_usec -= 1000000 * nsec;
107  y->tv_sec += nsec;
108  }
109  if (x->tv_usec - y->tv_usec > 1000000) {
110  int nsec = (x->tv_usec - y->tv_usec) / 1000000;
111  y->tv_usec += 1000000 * nsec;
112  y->tv_sec -= nsec;
113  }
114  /* Compute the time remaining to wait.
115  tv_usec is certainly positive. */
116  result->tv_sec = x->tv_sec - y->tv_sec;
117  result->tv_usec = x->tv_usec - y->tv_usec;
118  /* Return 1 if result is negative. */
119  return (double)result->tv_sec + (double)result->tv_usec/1000000.0;
120 }
121 
122 /* Convert from standard data layout to block data layout */
123 void std_to_bdl( double *A, double *Ablk, int N, int NB )
124 {
125  int X, Y, x, y;
126  int BB = N/NB;
127  for (X = 0; X < BB; X++)
128  for (Y = 0; Y < BB; Y++)
129  for (x = 0; x < NB; x++)
130  for (y = 0; y < NB; y++)
131  Ablk[Y*NB*NB + y + X*NB*NB*BB + x*NB] = A[Y*NB + y + X*NB*N + x*N];
132 }
133 
134 /* Convert from block to standard data layout */
135 void bdl_to_std( double *A, double *Ablk, int N, int NB )
136 {
137  int X, Y, x, y;
138  int BB = N/NB;
139  for (X = 0; X < BB; X++)
140  for (Y = 0; Y < BB; Y++)
141  for (x = 0; x < NB; x++)
142  for (y = 0; y < NB; y++) {
143  A[Y*NB + y + X*NB*N + x*N] = Ablk[Y*NB*NB + y + X*NB*NB*BB + x*NB];
144  }
145 }
146 
147 
148 /* Try various ways to do matmul and time them. Tiled algorithms
149  * running serially; multi-threaded QUARK runtime with tiled
150  * algorithms; and direct serial computation over standard layout. */
151 int main_algorithm(int NB, int N, int THREADS)
152 {
153  int i, j, k, nerr=0;
154  int BB = N/NB;
155  double *A = (double*)malloc(N*N*sizeof(double));
156  double *Ablk = (double*)malloc(N*N*sizeof(double));
157  double *B = (double*)malloc(N*N*sizeof(double));
158  double *Bblk = (double*)malloc(N*N*sizeof(double));
159  double *C_direct = (double*)malloc(N*N*sizeof(double));
160  double *C = (double*)malloc(N*N*sizeof(double));
161  double *Cblk = (double*)malloc(N*N*sizeof(double));
162  double *C_quark = (double*)malloc(N*N*sizeof(double));
163  double *C_quark_blk = (double*)malloc(N*N*sizeof(double));
164  struct timeval tstart, tend, tdiff;
165  double t_blk=0, t_quark=0, t_direct=0;
166 
167  // Initialize
168  for (i = 0; i < N; i++) {
169  for (j = 0; j < N; j++) {
170  A[i+j*N] = (double)1.0+i;
171  B[i+j*N] = (double)2.0+i+j;
172  C_quark[i+j*N] = C_direct[i+j*N] = C[i+j*N] = 3.0;
173  }
174  }
175 
176  matrix_print("Printing A", A, N);
177  matrix_print("Printing B", B, N);
178  matrix_print("Printing C before computation", C, N);
179 
180  // Move from F77 to BDL
181  std_to_bdl( A, Ablk, N, NB );
182  std_to_bdl( B, Bblk, N, NB );
183  std_to_bdl( C, Cblk, N, NB );
184  std_to_bdl( C_quark, C_quark_blk, N, NB );
185 
186  /* ORIGINAL TILED ROUTINE */
187  /* This is the code for the serial tile-by-tile multiplication */
188  printf("Doing matrix multiplication using serial tile-by-tile algorithm\n");
189  gettimeofday( &tstart, NULL );
190  for (i = 0; i < BB; i++)
191  for (j = 0; j < BB; j++)
192  for (k = 0; k < BB; k++)
193  matmul ( &Ablk[NB*NB*i + NB*NB*BB*k], &Bblk[NB*NB*k + NB*NB*BB*j], &Cblk[NB*NB*i + NB*NB*BB*j], NB);
194  gettimeofday( &tend, NULL );
195  t_blk = timeval_subtract( &tdiff, &tend, &tstart );
196  printf("Time taken: %f\n", tdiff.tv_sec + (double)tdiff.tv_usec/1000000 );
197  bdl_to_std( C, Cblk, N, NB );
198  matrix_print("Printing C produced by serial tile-algorithm after computation", C, N);
199  printf("\n");
200 
201  /* QUARK PARALLEL TILED ROUTINE */
202  /* This is the code for the QUARK runtime do do the parallel multi-threaded tile-by-tile algorithm */
203  printf("Doing matrix multiplication using the multi-threaded QUARK runtime for a tile based algorithm\n");
204  Quark *quark = QUARK_New(THREADS);
205  gettimeofday( &tstart, NULL );
206  for (i = 0; i < BB; i++)
207  for (j = 0; j < BB; j++)
208  for (k = 0; k < BB; k++)
209  matmul_quark_call ( quark, &Ablk[NB*NB*i + NB*NB*BB*k], &Bblk[NB*NB*k + NB*NB*BB*j], &C_quark_blk[NB*NB*i + NB*NB*BB*j], NB);
210  QUARK_Barrier( quark );
211  gettimeofday( &tend, NULL );
212  t_quark = timeval_subtract( &tdiff, &tend, &tstart );
213  printf("Time taken: %f\n", tdiff.tv_sec + (double)tdiff.tv_usec/1000000 );
214  QUARK_Delete(quark);
215  bdl_to_std( C_quark, C_quark_blk, N, NB );
216  matrix_print("Printing C produced by QUARK runtime after computation", C_quark, N);
217  printf("\n");
218 
219  /* DIRECT COMPUTATION OVER STANDARD LAYOUT */
220  /* Compute direct C if desired */
221  printf("Doing matrix multiplication using direct loops (ie, view matrix as one big tile)\n");
222  gettimeofday( &tstart, NULL );
223  matmul ( A, B, C_direct, N );
224  gettimeofday( &tend, NULL );
225  t_direct = timeval_subtract( &tdiff, &tend, &tstart );
226  printf("Time taken: %f\n", (double)(tdiff.tv_sec + (double)tdiff.tv_usec/1000000) );
227  matrix_print("Printing C produced by direct matmul after computation", C_direct, N);
228  printf("\n");
229 
230  /* Check for errors */
231  printf("Comparing result matrices (direct versus QUARK)\n");
232  nerr = matrix_compare( C_direct, C_quark, N );
233  printf("Number of differences: %d\n", nerr);
234  printf("\n");
235 
236  printf("Summary of time taken\n");
237  printf("%-12s %-12s %-12s (%d threads)\n", "BigLoops", "SerialBlocks", "QUARKBlocks", THREADS);
238  printf("%-12.5f %-12.5f %-12.5f\n", t_direct, t_blk, t_quark);
239 
240  free(A); free(Ablk); free(B); free(Bblk); free(C); free(Cblk); free(C_direct); free(C_quark); free(C_quark_blk);
241  return 0;
242 }
243 
244 /* Grab command line args and launch the matmul tests */
245 int main (int argc, char **argv)
246 {
247  int NB, N, THREADS;
248  if (argc != 4) {
249  printf( "Usage: %s NB N THREADS\n", argv[0] );
250  printf( "Usage: Note: N / NB must be an integer\n" );
251  printf( "Usage: Assuming a simple test run, with the following parameters\n" );
252  printf( "%s 200 800 2\n", argv[0] );
253  NB = 200;
254  N = 800;
255  THREADS = 2;
256  } else {
257  NB = atoi(argv[1]);
258  N = atoi(argv[2]);
259  THREADS = atoi(argv[3]);
260  }
261  assert( N % NB == 0 );
262  main_algorithm(NB, N, THREADS);
263 }