#include <iostream.h>

#include <timeit.h>

/*
   This example demonstrates how loop unrolling
   may improve performance.  old_loop is a generic
   loop that adds the contents of an array and
   then prints the value out, as a check.
   new_loop is the unrolled loop.  Note that
   the printout may be necessary, as some compilers
   may completely optimize away functions
   which do work but don't use the results of that
   work.
 */

 /*
    NUM is just a 'large' number so we have enough
    data to compute slowly enough to get good time
    measurements.
  */
// Number of elements in the array summed by both loops.
const int NUM = 30912;
 
// The data being summed; filled in by setup_data().
float x[NUM];

// prototypes
void setup_data();   // fills x[] and prints NUM
void old_loop();     // sums x[] one element per iteration
void new_loop();     // sums x[] four elements per iteration

// Report the array size, then store each element's own index in it
// (x[0] = 0, x[1] = 1, ..., x[NUM-1] = NUM-1).
void setup_data()
{
   cout << "NUM = " << NUM << endl;

   int idx = 0;
   while ( idx < NUM )
   {
      x[idx] = idx;
      ++idx;
   }
}

// Baseline version: accumulate the array one element per iteration
// and print the total so the work cannot be discarded as unused.
void old_loop()
{
  float total = 0;

  // Walk the array front to back, adding each element in turn.
  for ( const float *cur = x; cur != x + NUM; ++cur )
  {
     total += *cur;
  }

  cout << "sum = " << total << endl;
}

// Unrolled version: accumulate the array four elements per iteration
// and print the total so the work cannot be discarded as unused.
void new_loop()
{
  float total = 0;

  /* The unrolling factor of 4 is an arbitrary choice;
   * other factors (5, 8, ...) would work too.  Past
   * some point, though, adding more unrolled statements
   * to the loop body stops adding much performance gain.
   */
  const int leftover = NUM % 4;

  // Handle the first NUM%4 elements one at a time, so that the
  // number of elements left over is an exact multiple of 4.
  int idx = 0;
  while ( idx < leftover )
  {
     total += x[idx];
     ++idx;
  }

  // Main unrolled body: four additions per trip, in array order.
  while ( idx < NUM )
  {
     total += x[idx];
     total += x[idx+1];
     total += x[idx+2];
     total += x[idx+3];
     idx += 4;
  }

  cout << "sum = " << total << endl;
}

int main( void )
{
   timeobj *tt = timeit_new();

   // initialize our data
   setup_data();

   // time the non-unrolled loop
   timeit_start( tt );
   old_loop();
   timeit_stop( tt );
   cout << "normal loop (secs):" << timeit_getf( tt, timeit_seconds ) << endl;

   // time the unrolled loop
   timeit_start( tt );
   
   new_loop();
   timeit_stop( tt );
   cout << "unrolled loop (secs):" << timeit_getf( tt, timeit_seconds ) << endl;

   // cleanup
   timeit_delete( tt );

   return( 0 );
}
