/* -*- C++ -*-
 *
 * ---------------------------------------------------------------------
 * $Id: eval.h,v 1.2.4.6 2004/05/25 22:03:39 drory Exp $
 * ---------------------------------------------------------------------
 *
 * Copyright (C) 2000-2002 Niv Drory <drory@usm.uni-muenchen.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
 *
 * ---------------------------------------------------------------------
 *
 */


#ifndef __LTL_IN_FILE_MARRAY__
#error "<ltl/marray/eval.h> must be included via <ltl/marray.h>, never alone!"
#endif

#ifndef __LTL_EVAL_EXPR__
#define __LTL_EVAL_EXPR__

#include <ltl/config.h>

#ifdef LTL_DEBUG_EXPRESSIONS
#include <iostream>
#endif

LTL_BEGIN_NAMESPACE

#ifdef LTL_USE_SIMD

//@{

/*!
 *   This helper class provides specializations for evaluating expressions that
 *   have vectorized versions (through applicops_altivec.h).
 *
 *   If an expression is not vectorizable, i.e. it contains operations that have
 *   no vectorized version, we don't want any of the eval_vec methods to be called
 *   (since they are useless and might not even compile). Therefore, we
 *   use the compile time constant Expr::isVectorizable to pick a specialization
 *   of the eval_assign_expr_vec function, so that we do not instantiate any code
 *   that touches vector evaluations if the expression is not vectorizable.
 *
 *   The primary template below is intentionally empty: it declares no
 *   eval_assign_expr_vec() itself, that member is supplied only by the
 *   explicit specializations on the value of \c Vectorizable.
 */
template<bool Vectorizable>
struct eval_vectorizable
{
};

//! specialization for vectorizable expressions
template<>
struct eval_vectorizable<1>
{
      /*!
       *  Evaluate expression \c e and store the result in \c a, using the
       *  vector read interface of the expression (readVec()) where possible.
       *
       *  The vector path is taken only if both the array and the expression
       *  report stride 1 and the expression reports the same alignment as the
       *  destination pointer. The work is then split into three parts:
       *    -# a scalar loop for the elements in front of the next 16-byte
       *       boundary of the destination,
       *    -# a vector loop (unrolled by 4) over whole vectors,
       *    -# a scalar loop for the elements that are left over.
       *
       *  In all other cases the call is forwarded to eval_assign_expr_1().
       *
       *  \param a  destination array
       *  \param e  expression to evaluate
       */
      template<class T, int N, class E>
      static inline void eval_assign_expr_vec( MArray<T,N>& a, TExpr<E,N>& e )
      {
         const int innerLoopNum = a.nelements();
         T* restrict_ dataptr = a.data();

         if( a.isStride1() && e.isStride1() && e.sameAlignmentAs( dataptr ) )
         {
#ifdef LTL_DEBUG_EXPRESSIONS
            cerr << "using Altivec/SSE vectorization\n";
            cerr << "  -- alignment : " << ((long)dataptr & 0x0FL) << endl;
#endif
            // good, both arrays have stride 1 and both arrays
            // also have contiguous memory
            // they also have the same alignment in memory
            // so we can actually vectorize

            int j = 0;

            // number of elements between dataptr and the next 16-byte boundary
            const int elemsPerVec = sizeof(typename VEC_TYPE(T))/sizeof(T);
            const int toBoundary = (elemsPerVec - ((long)dataptr & 0x0FL)/sizeof(T))%elemsPerVec;

            // Never peel more elements than the array holds: for an array that
            // ends before the boundary is reached, an unclamped count would make
            // the scalar loop below read and write past the end of the data
            // (and dataptr+beforeAlign would point beyond one-past-the-end).
            // With the clamp, the vector loop and the tail loop run zero times
            // for such arrays.
            const int beforeAlign = toBoundary < innerLoopNum ? toBoundary : innerLoopNum;
            typename VEC_TYPE(T)* restrict_ vecptr_ = (typename VEC_TYPE(T)*)(dataptr+beforeAlign);

            // handle the elements up to the next alignment boundary
            for( ; j<beforeAlign; ++j )
               dataptr[j] = e.readWithoutStride(j);

            // now we can use the vector facilities
            const int vecLoopCount = (innerLoopNum - beforeAlign)/elemsPerVec;
            int k = 0;

            // unroll the inner vector loop: first the (vecLoopCount mod 4)
            // leading vectors one at a time, then groups of four
            const int vecLoopCountMod4 = vecLoopCount & 3;
            for( ; k<vecLoopCountMod4; ++k )
               vecptr_[k] = e.readVec(k);

            for( ; k<vecLoopCount; k+=4 )
            {
               // MArrayIter::readVec will prefetch (with GCC) 4 vectors ahead
               typename VEC_TYPE(T) tmp1 = e.readVec(k  );
               typename VEC_TYPE(T) tmp2 = e.readVec(k+1);
               typename VEC_TYPE(T) tmp3 = e.readVec(k+2);
               typename VEC_TYPE(T) tmp4 = e.readVec(k+3);
               vecptr_[k  ] = tmp1;
               vecptr_[k+1] = tmp2;
               vecptr_[k+2] = tmp3;
               vecptr_[k+3] = tmp4;
            }

            // handle the remaining elements (fewer than one full vector)
            j += vecLoopCount*elemsPerVec;
            for( ; j<innerLoopNum; ++j )
               dataptr[j] = e.readWithoutStride(j);
         }
         else
            eval_assign_expr_1( a, e );
      }
};



/*!
 *  Specialization for non-vectorizable expressions.
 *
 *  No vector code is touched here; the assignment is handed straight to the
 *  scalar routine eval_assign_expr_1().
 */
template<>
struct eval_vectorizable<0>
{
      //! Forward \c a and \c e unchanged to eval_assign_expr_1().
      template<class T, int N, class E>
      static inline void eval_assign_expr_vec( MArray<T,N>& a, TExpr<E,N>& e )
      {
         eval_assign_expr_1( a, e );
      }
};

//@}
#endif  //LTL_USE_SIMD


/*!
 *  This function is called from MArray::operator= ( TExpr<>& e )
 *  to actually perform the evaluation and assignment.
 *
 *  It only selects the evaluation strategy:
 *    - expressions containing index iterators go to eval_with_index_iter(),
 *    - 1-dimensional or fully contiguous operands go to the collapsed
 *      1-D loop (vectorized if LTL_USE_SIMD is defined and the expression
 *      type says it is vectorizable),
 *    - everything else goes to eval_assign_expr_N().
 */
template<class T, int N, class E>
inline void eval_assign_expr( MArray<T,N>& a, TExpr<E,N>& e )
{
   CHECK_CONFORM( e, a );

   // Index iterators need the actual indices to be tracked, which rules out
   // loop collapsing and the optimized access patterns used further down.
   if( TExpr<E,N>::numIndexIter != 0 )
   {
#ifdef LTL_DEBUG_EXPRESSIONS
      cerr << "found " << TExpr<E,N>::numIndexIter << " index iters\n";
#endif
      eval_with_index_iter( a, e );
      return;
   }

   // Fast 1-dimensional traversal is possible if the expression is
   // 1-dimensional or if the memory layout of all operands is contiguous.
   const bool canCollapse =
      ( N == 1 ) || ( a.isStorageContiguous() && e.isStorageContiguous() );

   if( !canCollapse )
   {
      // general case, none of the (pseudo) 1-D optimizations apply
      eval_assign_expr_N( a, e );
      return;
   }

#ifdef LTL_USE_SIMD
   // Compile-time choice: if every term of the expression has a vectorized
   // implementation we try to vectorize, otherwise the scalar code is used.
   eval_vectorizable<TExpr<E,N>::isVectorizable>::eval_assign_expr_vec( a, e );
#else
   // scalar version
   eval_assign_expr_1( a, e );
#endif
}



//! Scalar evaluation with the loops collapsed into a single 1-dimensional one
//  (see vectorized version above).
//
//  All a.nelements() elements are assigned in one flat loop, unrolled by 4.
//  If array and expression both have stride 1, elements are addressed by
//  plain index; otherwise the destination offset advances by a.stride(1)
//  per element and the expression is read through readWithStride().
//
template<class T, int N, class E>
void eval_assign_expr_1( MArray<T,N>& a, TExpr<E,N>& e )
{
#ifdef LTL_DEBUG_EXPRESSIONS
   cerr << "evaluating with fully collapsed loops\n";
#endif

   const int count = a.nelements();
   T* restrict_ dst = a.data();

   if( !( a.isStride1() && e.isStride1() ) )
   {
#ifdef LTL_DEBUG_EXPRESSIONS
      cerr << "evaluating without common stride\n";
#endif
      // no common unit stride: slightly less efficient
      const int head = count & 3;
      const int step = a.stride(1);

      int idx = 0;   // element number
      int off = 0;   // destination offset, always idx*step

      // leading (count mod 4) elements, one at a time
      while( idx < head )
      {
         dst[off] = e.readWithStride(idx);
         ++idx;
         off += step;
      }

      // the rest in groups of four: read all four values, then store them
      while( idx < count )
      {
         typename E::value_type v0 = e.readWithStride(idx  );
         typename E::value_type v1 = e.readWithStride(idx+1);
         typename E::value_type v2 = e.readWithStride(idx+2);
         typename E::value_type v3 = e.readWithStride(idx+3);

         dst[off         ] = v0;
         dst[off+  step  ] = v1;
         dst[off+2*step  ] = v2;
         dst[off+3*step  ] = v3;

         idx += 4;
         off += 4*step;
      }
      return;
   }

#ifdef LTL_DEBUG_EXPRESSIONS
   cerr << "evaluating with common stride 1\n";
#endif
   // both operands have stride 1 (and contiguous memory)
   const int head = count & 3;
   int idx = 0;

   // leading (count mod 4) elements, one at a time
   while( idx < head )
   {
      dst[idx] = e.readWithoutStride(idx);
      ++idx;
   }

   // The values are collected in temporaries before anything is stored:
   // gcc does poor aliasing analysis (in spite of the use of restrict), and
   // this makes it clear that writing to the result array does not
   // invalidate _any_ of the data associated with the expression or its
   // iterators (otherwise the data_ pointers of the iterators get reloaded
   // every time).
   while( idx < count )
   {
      typename E::value_type v0 = e.readWithoutStride(idx  );
      typename E::value_type v1 = e.readWithoutStride(idx+1);
      typename E::value_type v2 = e.readWithoutStride(idx+2);
      typename E::value_type v3 = e.readWithoutStride(idx+3);

      dst[idx  ] = v0;
      dst[idx+1] = v1;
      dst[idx+2] = v2;
      dst[idx+3] = v3;

      idx += 4;
   }
}



//! General N-dimensional evaluation.
//
//  Used when the storage is known not to be contiguous. The innermost
//  dimension (a.length(1) elements) is processed as an unrolled-by-4 loop;
//  after each such run both the array iterator and the expression are
//  moved on with advanceN() followed by advanceDim(), until a.nelements()
//  elements have been assigned.
//
template<class T, int N, class E>
void eval_assign_expr_N( MArray<T,N>& a, TExpr<E,N>& e )
{
   // we already know that the storage is not contiguous.
#ifdef LTL_DEBUG_EXPRESSIONS
   cerr << "evaluating with stack traversal\n";
#endif

   const int rowLen = a.length(1);
   const int head = rowLen & 3;      // elements handled before the unrolled loop
   int remaining = a.nelements();    // elements still to be assigned

   typename MArray<T,N>::iterator dst = a.begin();

   if( !( a.isStride1() && e.isStride1() ) )
   {
#ifdef LTL_DEBUG_EXPRESSIONS
      cerr << "evaluating without common stride\n";
#endif

      const int step = a.stride(1);

      for( ; remaining != 0; remaining -= rowLen )
      {
         T* restrict_ row = dst.data();

         int idx = 0;   // element number within the run
         int off = 0;   // destination offset, always idx*step

         while( idx < head )
         {
            row[off] = e.readWithStride(idx);
            ++idx;
            off += step;
         }

         // read four values first, then store them
         while( idx < rowLen )
         {
            typename E::value_type v0 = e.readWithStride(idx  );
            typename E::value_type v1 = e.readWithStride(idx+1);
            typename E::value_type v2 = e.readWithStride(idx+2);
            typename E::value_type v3 = e.readWithStride(idx+3);

            row[off         ] = v0;
            row[off+  step  ] = v1;
            row[off+2*step  ] = v2;
            row[off+3*step  ] = v3;

            idx += 4;
            off += 4*step;
         }

         // move both sides past this run and on to the next one
         dst.advanceN( rowLen );
         e.advanceN( rowLen );
         dst.advanceDim();
         e.advanceDim();
      }
      return;
   }

#ifdef LTL_DEBUG_EXPRESSIONS
   cerr << "evaluating with common stride 1\n";
#endif

   for( ; remaining != 0; remaining -= rowLen )
   {
      T* restrict_ row = dst.data();

      int idx = 0;
      while( idx < head )
      {
         row[idx] = e.readWithoutStride(idx);
         ++idx;
      }

      // The values are collected in temporaries before anything is stored:
      // gcc does poor aliasing analysis (in spite of the use of restrict),
      // and this makes it clear that writing to the result array does not
      // invalidate _any_ of the data associated with the expression or its
      // iterators (otherwise the data_ pointers of the iterators get
      // reloaded every time).
      while( idx < rowLen )
      {
         typename E::value_type v0 = e.readWithoutStride(idx  );
         typename E::value_type v1 = e.readWithoutStride(idx+1);
         typename E::value_type v2 = e.readWithoutStride(idx+2);
         typename E::value_type v3 = e.readWithoutStride(idx+3);

         row[idx  ] = v0;
         row[idx+1] = v1;
         row[idx+2] = v2;
         row[idx+3] = v3;

         idx += 4;
      }

      // move both sides past this run and on to the next one
      dst.advanceN( rowLen );
      e.advanceN( rowLen );
      dst.advanceDim();
      e.advanceDim();
   }
}


// Evaluation for expressions that contain index iterators.
//
// None of the optimizations of the other evaluation functions are used
// here, since the current position has to be tracked: the array and the
// expression are stepped one element at a time, and whenever the array
// iterator reports needAdvanceDim(), both are moved on with advanceDim().
//
template<class T, int N, class E>
void eval_with_index_iter( MArray<T,N>& a, TExpr<E,N>& e )
{
#ifdef LTL_DEBUG_EXPRESSIONS
   cerr << "evaluating with pure stack traversal due to IndexIter\n";
#endif
   typename MArray<T,N>::iterator dst = a.begin();

   if( !dst.isStride1() || !e.isStride1() )
   {
      // no common unit stride: use the general advance()
      for( ; !dst.done(); )
      {
         *dst = *e;
         dst.advance();
         e.advance();
         if( dst.needAdvanceDim() )
         {
            dst.advanceDim();
            e.advanceDim();
         }
      }
      return;
   }

   // both sides have stride 1: use the stride-1 advance
   for( ; !dst.done(); )
   {
      *dst = *e;
      dst.advanceWithStride1();
      e.advanceWithStride1();
      if( dst.needAdvanceDim() )
      {
         dst.advanceDim();
         e.advanceDim();
      }
   }
}

LTL_END_NAMESPACE

#endif
