/**
 * Access methods used to perform benchmarks.
 **/

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <sys/time.h>
#include "inc/mob.h"
#include "inc/routines.h"
#include "inc/genCode.h"


/* global variables externally defined */
extern struct globalSystem sys;


/* Number of operations performed back to back within one loop iteration;
   this is done to minimize the effects of tight loops.  The benchmark
   routines in this file spell out exactly this many operations by hand,
   so the value must be kept in step with those unrolled bodies. */
#define STRAIGHT 32

/* Scale a total of x microseconds spread over y operations into the
   average number of nanoseconds per operation (evaluated in double). */
#define SCALE_NANOS( x, y ) ( 1000 * ((double)(x) / (double)(y)) )


/* Macros used to isolate the timing calls common to all routines.
   When MACH_TIME is defined none of the macros below are defined here;
   a machine dependent timing routine is then used instead for higher
   resolution (presumably supplied by one of the project headers --
   TODO confirm). */
#ifndef MACH_TIME

/* declare the start/stop time stamps used by TIME_START and TIME_END */
#define TIME_DECLARE   struct timeval start, stop;

/* record the current time of day as the start of a timed section */
#define TIME_START     gettimeofday( &start, NULL );

/* record the end of a timed section and add the elapsed time to x.
   The seconds and microseconds fields are differenced separately before
   scaling: multiplying the absolute tv_sec value by MILL overflows a signed
   32-bit long, whereas the difference between two nearby time stamps is small.
   MILL is defined outside this file (presumably 1000000, the number of
   microseconds per second -- TODO confirm). */
#define TIME_END( x )  gettimeofday( &stop, NULL ); \
                       x += ((stop.tv_sec - start.tv_sec) * MILL + (stop.tv_usec - start.tv_usec));

#endif




/*
  Perform benchmark for reading in small strides. 
  The strides must be small enough that at least STRAIGHT reads
  can be made without breaking memory.

  array - memory to read
  size - total size of memory to cover (bytes)
  stride - size of stride for each read (bytes)
  sets - number of sets to run before checking time (reduce overhead due to time calls)
  return - average time per operation (ns) 
*/
double dataRead( char *array, unsigned int size, unsigned int stride, unsigned int sets ) {

  /* use as many registers as possible between time calls, ordering is important */
  register char *read;
  /* NOTE(review): phony is assigned but never read, so an optimizing compiler
     is free to drop the loads -- presumably this file is built with
     optimization disabled; confirm against the build flags */
  register int phony;
  register unsigned int r_stride, r, s, reps, r_sets = sets;
  register char *r_array = array;
  unsigned int blocks = 0;
  unsigned long usecs = 0;
  TIME_DECLARE;

  /* if no stride provided use a single word */
  r_stride = (stride) ? stride : sizeof( int );

  /* calculate the number of inner loop repetitions needed to stride the
     entire memory size at the given stride, taking into account that each
     loop does STRAIGHT number of operations; when size is smaller than
     r_stride * STRAIGHT this is zero, no reads are performed and the final
     division below is a floating point division by zero */
  reps = size / (r_stride * STRAIGHT);

  /* run for at least the requested amount of time */
  do {
    /* record the starting time */
    TIME_START;

    /* force a minimum of loops to avoid time call overhead */
    for( s = 0; s < r_sets; s++ ) {
      /* reset the read pointer to the start of memory less the first advance */
      read = r_array - r_stride;
      /* duplicate the operation STRAIGHT times due to problems with tight loops */
      for( r = 0; r < reps; r++ ) {
        /* advance by stride and read one byte (STRAIGHT copies, in groups of 8) */
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);

        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);

        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);

        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
      }
    }

    /* mark ending time and accumulate elapsed time in us */
    TIME_END( usecs );

    /* keep a count of the total number of "set" rounds */
    blocks++;

    /* keep going until we've spent all of our trial time */
  } while( usecs < sys.args.runTime );

  /* return the average time per operation in ns; the operation count is
     formed in double because the unsigned int product
     blocks * sets * reps * STRAIGHT wraps past UINT_MAX on long runs,
     which would shrink the divisor and inflate the reported time */
  return( SCALE_NANOS( usecs, (double)blocks * sets * reps * STRAIGHT ) );
}



/*
  Perform benchmark for reading in larger strides using a wrap-around
  check to limit the range of memory accessed.  The wrap-around check
  is more expensive than the normal striding, but it can be used when
  fewer than STRAIGHT strides fit in the memory.

  array - memory to read
  size - total size of memory to cover (bytes)
  stride - size of stride for each read (bytes)
  sets - number of sets to run before checking time (reduce overhead due to time calls)
  return - average time per operation (ns) 
*/
double dataReadWrap( char *array, unsigned int size, unsigned int stride, unsigned int sets ) {

  /* use as many registers as possible between time calls, ordering is important;
     NOTE(review): phony is assigned but never read, so an optimizing compiler
     is free to drop the loads -- presumably built with optimization disabled;
     confirm against the build flags */
  register char *read, phony, *r_array, *wrap;
  register unsigned int r_stride, s, r_sets = sets;
  unsigned int blocks = 0;
  unsigned long usecs = 0;
  TIME_DECLARE;

  /* if no stride provided use a single word */
  r_stride = (stride) ? stride : sizeof( int );
  /* assign local pointers and compute the wrap-around address; the read
     pointer is set only here, so each set continues from wherever the
     previous one stopped */
  read = r_array = array;
  wrap = array + size;

  /* run for at least the requested amount of time */
  do {
    /* record the starting time */
    TIME_START;

    /* force a minimum of loops to avoid time call overhead,
       duplicate the operation STRAIGHT times due to problems with tight loops */
    for( s = 0; s < r_sets; s++ ) {
      /* read one byte and advance by stride, wrapping around at end of memory
         (STRAIGHT copies, in groups of 8) */
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;

      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;

      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;

      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
    }

    /* mark ending time and accumulate elapsed time in us */
    TIME_END( usecs );

    /* keep a count of the total number of "set" rounds */
    blocks++;

    /* keep going until we've spent all of our trial time */
  } while( usecs < sys.args.runTime );

  /* return the average time per operation in ns; the operation count is
     formed in double because the unsigned int product
     blocks * sets * STRAIGHT can wrap past UINT_MAX, which would shrink
     the divisor and inflate the reported time */
  return( SCALE_NANOS( usecs, (double)blocks * sets * STRAIGHT ) );
}



/*
  Perform benchmark for writing in small strides. 
  The strides must be small enough that at least STRAIGHT writes
  can be made without breaking memory.

  array - memory to write
  size - total size of memory to cover (bytes)
  stride - size of stride for each write (bytes)
  sets - number of sets to run before checking time (reduce overhead due to time calls)
  return - average time per operation (ns) 
*/
double dataWrite( char *array, unsigned int size, unsigned int stride, unsigned int sets ) {

  /* use as many registers as possible between time calls, ordering is important */
  register char *write;
  register unsigned int r_stride, r, s, reps, r_sets = sets;
  register char *r_array = array;
  unsigned int blocks = 0;
  unsigned long usecs = 0;
  TIME_DECLARE;

  /* if no stride provided use a single word */
  r_stride = (stride) ? stride : sizeof( int );

  /* calculate the number of inner loop repetitions needed to stride the
     entire memory size at the given stride, taking into account that each
     loop does STRAIGHT number of operations; when size is smaller than
     r_stride * STRAIGHT this is zero, no writes are performed and the final
     division below is a floating point division by zero */
  reps = size / (r_stride * STRAIGHT);

  /* run for at least the requested amount of time */
  do {
    /* record the starting time */
    TIME_START;

    /* force a minimum of loops to avoid time call overhead */
    for( s = 0; s < r_sets; s++ ) {
      /* reset the write pointer to the start of memory less the first advance */
      write = r_array - r_stride;
      /* duplicate the operation STRAIGHT times due to problems with tight loops */
      for( r = 0; r < reps; r++ ) {
        /* advance by stride and write one byte (STRAIGHT copies, in groups of 8) */
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;

        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;

        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;

        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
        *(write += r_stride) = 1;
      }
    }

    /* mark ending time and accumulate elapsed time in us */
    TIME_END( usecs );

    /* keep a count of the total number of set rounds */
    blocks++;

    /* keep going until we've spent all of our trial time */
  } while( usecs < sys.args.runTime );

  /* return the average time per operation in ns; the operation count is
     formed in double because the unsigned int product
     blocks * sets * reps * STRAIGHT wraps past UINT_MAX on long runs,
     which would shrink the divisor and inflate the reported time */
  return( SCALE_NANOS( usecs, (double)blocks * sets * reps * STRAIGHT ) );
}



/*
  Perform benchmark for executing generated code sequentially or in strides. 

  code - memory with space for generated code
  size - total size of memory to cover (bytes)
  stride - size of stride for each jump (bytes)
  sets - number of sets to run before checking time (reduce overhead due to time calls)
  return - average time per operation (ns) 
*/
double instExecute( char *code, unsigned int size, unsigned int stride, unsigned int sets ) {

  /* use as many registers as possible between time calls, ordering is important,
     but values passed to code macros need to be on the stack */
#ifndef REP_BLOCK
  register unsigned int s, r_sets = sets;
  /* address of the "back" label below (GCC labels-as-values extension);
     it is handed to RUN_BLOCK, presumably as the location the generated
     code returns to -- TODO confirm against the RUN_BLOCK definition */
  void *back = &&back;
#endif
  unsigned int ops, blocks = 0;
  unsigned long usecs = 0;
  TIME_DECLARE;

  /* calculate the total number of operations performed,
     if no stride is given then operations are sequential nops */
  ops = (stride) ? (size / stride) : (size / nopBlockSize());

  /* generate code segment of correct size with requested stride */
  generateCode( code, size, stride );

  /* run for at least the requested amount of time */
  do {
    /* record the starting time */
    TIME_START;

#ifdef REP_BLOCK
    /* the platform supplies its own macro that repeats the block sets times */
    REP_BLOCK( code, stride, sets );
#else
    /* force a minimum of loops to avoid time call overhead;
       the loop counter is advanced at the "back" label */
    for( s = 0; s < r_sets; ) {
    run:
      RUN_BLOCK( code, stride, back )
      /* use goto's to force label alignment */
      goto back;
      goto run;
    back:
      s++;
    }
#endif

    /* mark ending time and accumulate elapsed time in us */
    TIME_END( usecs );

    /* keep a count of the total number of set rounds */
    blocks++;

    /* keep going until we've spent all of our trial time */
  } while( usecs < sys.args.runTime );

  /* return the average time per operation in ns; the operation count is
     formed in double because the unsigned int product blocks * sets * ops
     can wrap past UINT_MAX on long runs, which would shrink the divisor
     and inflate the reported time */
  return( SCALE_NANOS( usecs, (double)blocks * sets * ops ) );
}



/*
  Perform benchmark to alternate between executing generated code
  and reading data with small strides.

  array - memory with data to read
  code - memory with space for generated code
  size - total size of memory to cover (bytes)
  stride - size of stride for each jump (bytes)
  sets - number of sets to run before checking time (reduce overhead due to time calls)
  return - average time per pair of operations, read + execute (ns) 
*/
double shareCheck( char *array, char *code, unsigned int size, unsigned int stride, unsigned int sets ) {

  /* use as many registers as possible between time calls, ordering is important,
     but values passed to code macros need to be on the stack;
     NOTE(review): phony is assigned but never read, so an optimizing compiler
     is free to drop the loads -- presumably built with optimization disabled;
     confirm against the build flags */
  register char *read, phony;
  register unsigned int r_stride, r, s, reps, r_sets = sets;
  register char *r_array = array;
  unsigned int blocks = 0;
  unsigned long usecs = 0;
  /* address of the "back" label below (GCC labels-as-values extension);
     it is handed to RUN_BLOCK, presumably as the location the generated
     code returns to -- TODO confirm against the RUN_BLOCK definition */
  void *back = &&back;
  TIME_DECLARE;

  /* if no stride provided use a single word */
  r_stride = (stride) ? stride : sizeof( int );

  /* calculate the number of inner loop repetitions needed to stride the
     entire memory size at the given stride, taking into account that each
     loop does STRAIGHT number of operations; when size is smaller than
     r_stride * STRAIGHT this is zero, no reads are performed and the final
     division below is a floating point division by zero */
  reps = size / (r_stride * STRAIGHT);

  /* generate code segment of correct size with requested stride */
  generateCode( code, size, stride );

  /* run for at least the requested amount of time */
  do {
    /* record the starting time */
    TIME_START;

    /* force a minimum of loops to avoid time call overhead;
       the loop counter is advanced at the "back" label */
    for( s = 0; s < r_sets; ) {
      /* reset the read pointer to the start of memory less the first advance */
      read = r_array - r_stride;
      /* duplicate the operation STRAIGHT times due to problems with tight loops */
      for( r = 0; r < reps; r++ ) {
        /* advance by stride and read one byte (STRAIGHT copies, in groups of 8) */
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);

        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);

        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);

        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
        phony = *(read += r_stride);
      }
    run:
      /* after the data pass, run the generated code once */
      RUN_BLOCK( code, stride, back );
      /* use goto's to force label alignment */
      goto back;
      goto run;
    back:
      s++;
    }

    /* mark ending time and accumulate elapsed time in us */
    TIME_END( usecs );

    /* keep a count of the total number of set rounds */
    blocks++;

    /* keep going until we've spent all of our trial time; the budget is
       doubled here compared with the single-purpose routines */
  } while( usecs < sys.args.runTime * 2 );

  /* return the average time per operation in ns,
     scaling only by the number of reads thus making the time a composite
     time rather than an average across operations; the read count is
     formed in double because the unsigned int product
     blocks * sets * reps * STRAIGHT wraps past UINT_MAX on long runs,
     which would shrink the divisor and inflate the reported time */
  return( SCALE_NANOS( usecs, (double)blocks * sets * reps * STRAIGHT ) );
}


/*
  Perform benchmark to alternate between executing generated code
  and reading data with larger strides.

  array - memory with data to read
  code - memory with space for generated code
  size - total size of memory to cover (bytes)
  stride - size of stride for each jump (bytes)
  sets - number of sets to run before checking time (reduce overhead due to time calls)
  return - average time per pair of operations, read + execute (ns) 
*/
double shareCheckWrap( char *array, char *code, unsigned int size, unsigned int stride, unsigned int sets ) {

  /* use as many registers as possible between time calls, ordering is important,
     but values passed to code macros need to be on the stack;
     NOTE(review): phony is assigned but never read, so an optimizing compiler
     is free to drop the loads -- presumably built with optimization disabled;
     confirm against the build flags */
  register char *read, phony, *r_array, *wrap;
  register unsigned int r_stride, s, r_sets = sets;
  unsigned int blocks = 0;
  unsigned long usecs = 0;
  /* address of the "back" label below (GCC labels-as-values extension);
     it is handed to RUN_BLOCK, presumably as the location the generated
     code returns to -- TODO confirm against the RUN_BLOCK definition */
  void *back = &&back;
  TIME_DECLARE;

  /* if no stride provided use a single word */
  r_stride = (stride) ? stride : sizeof( int );
  /* assign local pointers and compute the wrap-around address; the read
     pointer is set only here, so each set continues from wherever the
     previous one stopped */
  read = r_array = array;
  wrap = array + size;

  /* generate code segment of correct size with requested stride */
  generateCode( code, size, stride );

  /* run for at least the requested amount of time */
  do {
    /* record the starting time */
    TIME_START;

    /* force a minimum of loops to avoid time call overhead,
       duplicate the operation STRAIGHT times due to problems with tight loops;
       the loop counter is advanced at the "back" label */
    for( s = 0; s < r_sets;  ) {
      /* read one byte and advance by stride, wrapping around at end of memory
         (STRAIGHT copies, in groups of 8) */
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;

      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;

      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;

      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = *(read); read += r_stride; if( read >= wrap ) read = r_array;
    run:
      /* after the STRAIGHT reads, run the generated code once */
      RUN_BLOCK( code, stride, back );
      /* use goto's to force label alignment */
      goto back;
      goto run;
    back:
      s++;
    }

    /* mark ending time and accumulate elapsed time in us */
    TIME_END( usecs );

    /* keep a count of the total number of set rounds */
    blocks++;

    /* keep going until we've spent all of our trial time; the budget is
       doubled here compared with the single-purpose routines */
  } while( usecs < sys.args.runTime * 2 );

  /* return the average time per operation in ns,
     scaling only by the number of reads thus making the time a composite
     time rather than an average across operations; the read count is
     formed in double because the unsigned int product
     blocks * sets * STRAIGHT can wrap past UINT_MAX, which would shrink
     the divisor and inflate the reported time */
  return( SCALE_NANOS( usecs, (double)blocks * sets * STRAIGHT ) );
}



/*
  Provide a control time for dataRead(). 
  This does everything that dataRead() does except actually referencing memory.
  It provides a measure of the overhead incurred on top of the actual read
  times, as well as a means to gauge noise in the data.

  array - memory to read
  size - total size of memory to cover (bytes)
  stride - size of stride for each read (bytes)
  sets - number of sets to run before checking time (reduce overhead due to time calls)
  return - average time per operation (ns) 
*/
double ctrlRead( char *array, unsigned int size, unsigned int stride, unsigned int sets ) {

  /* use as many registers as possible between time calls, ordering is important;
     phony is a pointer here because the control loop stores the advanced
     address instead of the byte it points at */
  register char *read, *phony;
  register unsigned int r_stride, r, s, reps, r_sets = sets;
  register char *r_array = array;
  unsigned int blocks = 0;
  unsigned long usecs = 0;
  TIME_DECLARE;

  /* if no stride provided use a single word */
  r_stride = (stride) ? stride : sizeof( int );

  /* calculate the number of inner loop repetitions needed to stride the
     entire memory size at the given stride, taking into account that each
     loop does STRAIGHT number of operations; when size is smaller than
     r_stride * STRAIGHT this is zero, the inner loop never runs and the
     final division below is a floating point division by zero */
  reps = size / (r_stride * STRAIGHT);

  /* run for at least the requested amount of time */
  do {
    /* record the starting time */
    TIME_START;

    /* force a minimum of loops to avoid time call overhead */
    for( s = 0; s < r_sets; s++ ) {
      /* reset the read pointer to the start of memory less the first advance */
      read = r_array - r_stride;
      /* duplicate the operation STRAIGHT times due to problems with tight loops */
      for( r = 0; r < reps; r++ ) {
        /* advance by stride and keep the address, without dereferencing it
           (STRAIGHT copies, in groups of 8) */
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);

        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);

        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);

        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
        phony = (read += r_stride);
      }
    }

    /* mark ending time and accumulate elapsed time in us */
    TIME_END( usecs );

    /* keep a count of the total number of "set" rounds */
    blocks++;

    /* keep going until we've spent all of our trial time */
  } while( usecs < sys.args.runTime );

  /* return the average time per operation in ns; the operation count is
     formed in double because the unsigned int product
     blocks * sets * reps * STRAIGHT wraps past UINT_MAX on long runs,
     which would shrink the divisor and inflate the reported time */
  return( SCALE_NANOS( usecs, (double)blocks * sets * reps * STRAIGHT ) );
}



/*
  Provide a control time for dataReadWrap(). 
  This does everything that dataReadWrap() does except actually referencing memory.
  It provides a measure of the overhead incurred on top of the actual read
  times, as well as a means to gauge noise in the data.

  array - memory to read
  size - total size of memory to cover (bytes)
  stride - size of stride for each read (bytes)
  sets - number of sets to run before checking time (reduce overhead due to time calls)
  return - average time per operation (ns) 
*/
double ctrlReadWrap( char *array, unsigned int size, unsigned int stride, unsigned int sets ) {

  /* use as many registers as possible between time calls, ordering is important;
     phony is a pointer here because the control loop stores the current
     address instead of the byte it points at */
  register char *read, *phony, *r_array, *wrap;
  register unsigned int r_stride, s, r_sets = sets;
  unsigned int blocks = 0;
  unsigned long usecs = 0;
  TIME_DECLARE;

  /* if no stride provided use a single word */
  r_stride = (stride) ? stride : sizeof( int );
  /* assign local pointers and compute the wrap-around address; the read
     pointer is set only here, so each set continues from wherever the
     previous one stopped */
  read = r_array = array;
  wrap = array + size;

  /* run for at least the requested amount of time */
  do {
    /* record the starting time */
    TIME_START;

    /* force a minimum of loops to avoid time call overhead,
       duplicate the operation STRAIGHT times due to problems with tight loops */
    for( s = 0; s < r_sets; s++ ) {
      /* keep the current address without dereferencing it, then advance by
         stride, wrapping around at end of memory
         (STRAIGHT copies, in groups of 8) */
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;

      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;

      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;

      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
      phony = (read); read += r_stride; if( read >= wrap ) read = r_array;
    }

    /* mark ending time and accumulate elapsed time in us */
    TIME_END( usecs );

    /* keep a count of the total number of "set" rounds */
    blocks++;

    /* keep going until we've spent all of our trial time */
  } while( usecs < sys.args.runTime );

  /* return the average time per operation in ns; the operation count is
     formed in double because the unsigned int product
     blocks * sets * STRAIGHT can wrap past UINT_MAX, which would shrink
     the divisor and inflate the reported time */
  return( SCALE_NANOS( usecs, (double)blocks * sets * STRAIGHT ) );
}
