/* -*- C++ -*-
 *
 * ---------------------------------------------------------------------
 * $Id: eval_reduc.h,v 1.3.2.1 2003/11/02 23:23:43 cag Exp $
 * ---------------------------------------------------------------------
 *
 * Copyright (C) 2000-2002 Niv Drory <drory@usm.uni-muenchen.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA 
 *
 * ---------------------------------------------------------------------
 *
 */


#ifndef __LTL_REDUCTIONS__
#error "<ltl/marray/eval_reduc.h> must be included via <ltl/statistics.h>, never alone!"
#endif


#ifndef __LTL_REDUCE__
#define __LTL_REDUCE__

#include <ltl/config.h>

LTL_BEGIN_NAMESPACE

// Entry point for evaluating a full reduction over an expression.
// Picks one of three traversal strategies and forwards e and R to it:
//   - expressions containing index iterators -> eval_full_reduction_with_index
//   - 1-dimensional or contiguous storage     -> eval_full_reduction_1
//   - everything else                         -> eval_full_reduction_N
//
template<int N, class E, class Reduction>
inline void eval_full_reduction( TExpr<E,N> e, Reduction& R )
{
   const bool hasIndexIter = ( TExpr<E,N>::numIndexIter != 0 );

   if( hasIndexIter )
   {
#ifdef LTL_DEBUG_EXPRESSIONS
      cerr << "found " << TExpr<E,N>::numIndexIter << " index iters\n";
#endif

      eval_full_reduction_with_index( e, R );
      return;
   }

   // a 1-dimensional expression, or one whose operands all report
   // contiguous storage, can be walked with the fast 1-dimensional
   // traversal (N == 1 is tested first, so isStorageContiguous() is
   // only queried for N > 1)
   const bool collapsible = ( N == 1 || e.isStorageContiguous() );

   if( collapsible )
      eval_full_reduction_1( e, R );
   else
      eval_full_reduction_N( e, R );
}


// we are dealing with the 1 dimensional case
//
// Visits all e.shape()->nelements() elements of the expression through a
// single flat index, feeding each one to the reduction's evaluate().
// The traversal is unrolled by 4; the (nelements & 3) leftover elements
// are handled first so that the unrolled loop always covers a multiple
// of 4.
//
// A false return from evaluate() requests early termination. Once that
// happens no further element is read: in particular, a stop request
// during the leftover loop must also prevent the unrolled loop from
// running, otherwise it would restart at an unaligned index, evaluate
// the same element a second time and could read past the last element.
//
// The result is handed back to the caller through R.copyResult().
//
template<int N, class E, class Reduction>
void eval_full_reduction_1( TExpr<E,N> e, Reduction& R )
{
#ifdef LTL_DEBUG_EXPRESSIONS
   cerr << "evaluating full reduction with collapsed loops\n";
#endif

   // copy the Reduction object to our local stack frame
   // so that the compiler sees that it can hold the members in registers
   // avoiding writing back to memory after every loop step
   Reduction R1(R);

   const int innerLoopNum = e.shape()->nelements();
   const int loopMod4 = innerLoopNum & 3;

   // cleared as soon as evaluate() asks us to stop
   bool loop = true;

   if( e.isStride1() )
   {
#ifdef LTL_DEBUG_EXPRESSIONS
      cerr << "evaluating full reduction with stride1\n";
#endif
      // the expression reports stride 1, so elements are read
      // through readWithoutStride()

      int j=0;
      // leftover elements first
      for( ; j<loopMod4; ++j )
         if( !R1.evaluate( e.readWithoutStride(j) ) )
         {
            loop = false;
            break;
         }

      // here j == loopMod4 unless we were asked to stop, so the
      // remaining count is a multiple of 4
      if( loop )
      {
         for( ; j<innerLoopNum; j+=4 )
         {
            if( !R1.evaluate( e.readWithoutStride(j) ) )
               break;
            if( !R1.evaluate( e.readWithoutStride(j+1) ) )
               break;
            if( !R1.evaluate( e.readWithoutStride(j+2) ) )
               break;
            if( !R1.evaluate( e.readWithoutStride(j+3) ) )
               break;
         }
      }
   }
   else
   {
#ifdef LTL_DEBUG_EXPRESSIONS
      cerr << "evaluating full reduction without common stride\n";
#endif
      // well, then slightly less efficient
      int i=0;
      // leftover elements first
      for( ; i<loopMod4; ++i )
         if( !R1.evaluate( e.readWithStride(i) ) )
         {
            loop = false;
            break;
         }

      // same reasoning as above: only continue if nobody asked to stop
      if( loop )
      {
         for( ; i<innerLoopNum; i+=4 )
         {
            if( !R1.evaluate( e.readWithStride(i) ) )
               break;
            if( !R1.evaluate( e.readWithStride(i+1) ) )
               break;
            if( !R1.evaluate( e.readWithStride(i+2) ) )
               break;
            if( !R1.evaluate( e.readWithStride(i+3) ) )
               break;
         }
      }
   }
   // copy results back
   R.copyResult( R1 );
}



// now the N-dimensional case
//
// The expression is traversed in runs of e.shape()->length(1) elements.
// After each run the expression is moved on with advanceN() followed by
// advanceDim(), until e.shape()->nelements() elements have been
// accounted for or evaluate() returns false.
//
template<int N, class E, class Reduction>
void eval_full_reduction_N( TExpr<E,N> e, Reduction& R )
{
   // the dispatcher only sends us here when the storage is not contiguous.
#ifdef LTL_DEBUG_EXPRESSIONS
   cerr << "evaluating full reduction with stack traversal\n";
#endif

   // work on a local copy of the Reduction object so the compiler can
   // keep its members in registers instead of writing them back to
   // memory after every step
   Reduction R1(R);

   const int runLength = e.shape()->length(1);
   int remaining = e.shape()->nelements();
   bool keepGoing = true;

   if( e.isStride1() )
   {
#ifdef LTL_DEBUG_EXPRESSIONS
      cerr << "evaluating full reduction with stride 1\n";
#endif

      for( ; remaining != 0 && keepGoing; remaining -= runLength )
      {
         for( int k=0; k<runLength; ++k )
         {
            if( !R1.evaluate( e.readWithoutStride(k) ) )
            {
               keepGoing = false;
               break;
            }
         }

         // the expression is advanced even after a stop request
         e.advanceN( runLength );
         e.advanceDim();
      }
   }
   else
   {
#ifdef LTL_DEBUG_EXPRESSIONS
      cerr << "evaluating without common stride\n";
#endif

      for( ; remaining != 0 && keepGoing; remaining -= runLength )
      {
         for( int k=0; k<runLength; ++k )
         {
            if( !R1.evaluate( e.readWithStride(k) ) )
            {
               keepGoing = false;
               break;
            }
         }

         // the expression is advanced even after a stop request
         e.advanceN( runLength );
         e.advanceDim();
      }
   }

   // hand the accumulated result back to the caller's object
   R.copyResult( R1 );
}



// if we have index expressions involved, we cannot use the above
// optimizations, since we need to keep track of where we are
//
// A ShapeIter over the expression's shape is stepped in lock-step with
// the expression itself; it decides when the traversal is done and when
// advanceDim() has to be called. Each element is obtained by
// dereferencing the expression and passed to evaluate(); a false return
// ends the traversal.
//
template<int N, class E, class Reduction>
void eval_full_reduction_with_index( TExpr<E,N> e, Reduction& R )
{
#ifdef LTL_DEBUG_EXPRESSIONS
   cerr << "evaluating with pure stack traversal due to IndexIter\n";
#endif

   // work on a local copy of the Reduction object so the compiler can
   // keep its members in registers instead of writing them back to
   // memory after every step
   Reduction R1(R);

   // an index iterator is part of the expression,
   // so no loop unrolling or similar tricks here ...
   ShapeIter<N> pos( *e.shape() );

   if( e.isStride1() )
   {
      // done() is checked before the element is evaluated
      while( !pos.done() && R1.evaluate( *e ) )
      {
         pos.advanceWithStride1();
         e.advanceWithStride1();

         if( !pos.needAdvanceDim() )
            continue;

         pos.advanceDim();
         e.advanceDim();
      }
   }
   else
   {
      // done() is checked before the element is evaluated
      while( !pos.done() && R1.evaluate( *e ) )
      {
         pos.advance();
         e.advance();

         if( !pos.needAdvanceDim() )
            continue;

         pos.advanceDim();
         e.advanceDim();
      }
   }

   // hand the accumulated result back to the caller's object
   R.copyResult( R1 );
}

LTL_END_NAMESPACE

#endif

