#include <iostream.h>
#include <stdlib.h>
#include <timeit.h>

/* Cache blocking example.  Original algorithm is in
   the function normal.  I've used loop unrolling
   as a technique to block the indices in the function
   blocked.

   DIM needs to be even since we unroll by 2 below.
   It would be nice if it were larger than the
   size of the machine's cache line, or else
   you may not see much difference in performance.

   Modify DIM and see what effect it has on
   the overall difference between the speed
   for the normal and blocked functions.  

*/

/* Edge length of the square matrices below; this is the knob to
   change when experimenting with the benchmark. */
const int DIM = 128;

/* The two DIM x DIM operands.  normal() and blocked() both update
   arrayA in place using the values in arrayB.  As file-scope objects
   without an initializer they start out zero-filled. */
float arrayA[DIM][DIM];
float arrayB[DIM][DIM];

/*
 * Baseline version: adds the transpose of arrayB into arrayA in
 * place, i.e. arrayA[j][i] += arrayB[i][j] for every 0 <= i,j < DIM.
 *
 * The inner loop steps the FIRST subscript of arrayA, so consecutive
 * updates touch elements that are a whole row (DIM floats) apart.
 * This is the unblocked form that blocked() is compared against.
 */
void normal()
{
   for (int col = 0; col < DIM; ++col)
   {
      for (int row = 0; row < DIM; ++row)
      {
         arrayA[row][col] += arrayB[col][row];
      }
   }
}

/*
 * Cache-blocked version of normal(): performs the same update,
 * arrayA[j][i] += arrayB[i][j] for every 0 <= i,j < DIM, but with
 * both loops unrolled by 2 so that each pass of the inner loop
 * handles a 2x2 tile of indices.
 *
 * The tiled loops run over the largest even-sized square.  If DIM is
 * odd, the one leftover index in each dimension is processed by the
 * clean-up code below, so every element is still updated exactly
 * once and no subscript ever reaches DIM.  For even DIM the clean-up
 * conditions are false and only the tiled loops execute.
 */
void blocked()
{
   const int even = DIM - (DIM % 2);   /* largest even bound <= DIM */
   int i,j;

   /* Unroll outer and inner loop */
   for (i=0; i<even; i+=2 )
   {
      for (j=0; j<even; j+=2 )
      {
         arrayA[  j][  i] += arrayB[  i][  j];
         arrayA[j+1][  i] += arrayB[  i][j+1];
         arrayA[  j][i+1] += arrayB[i+1][  j];
         arrayA[j+1][i+1] += arrayB[i+1][j+1];
      }

      /* Odd DIM only: the last j (== even) for this pair of i values */
      if (even < DIM)
      {
         arrayA[even][  i] += arrayB[  i][even];
         arrayA[even][i+1] += arrayB[i+1][even];
      }
   }

   /* Odd DIM only: the last i (== even), across every j */
   if (even < DIM)
   {
      for (j=0; j<DIM; j++)
         arrayA[j][even] += arrayB[even][j];
   }
}



/*
 * Benchmark driver.  Calls normal() LOOP times, then blocked() LOOP
 * times, bracketing each batch with timeit_start/timeit_stop on the
 * same timer object and printing the value reported by timeit_getf.
 *
 * NOTE(review): the timer API comes from timeit.h, which is not
 * visible here.  This code presumes that a second timeit_start on the
 * same object begins a fresh interval and that timeit_new does not
 * fail -- confirm against that header.
 *
 * Returns 0.
 */
int main()
{
   const int LOOP = 200;   /* repetitions per variant */
   int i;
   timeobj *tt = timeit_new();

   /* Time the conventional loop nest */
   timeit_start( tt );
   for (i=0;i<LOOP;i++)
   {
      normal();
   }
   timeit_stop( tt );
   cout << "Conventional method (secs):" << timeit_getf( tt, timeit_seconds ) << endl;

   /* Time the unrolled/blocked loop nest */
   timeit_start( tt );
   for (i=0;i<LOOP;i++)
   {
      blocked();
   }
   timeit_stop( tt );
   cout << "Blocked method (secs):" << timeit_getf( tt, timeit_seconds ) << endl;

   timeit_delete( tt );
   return 0;
}
