/* -*- C++ -*-
*
* ---------------------------------------------------------------------
* $Id: applicops_sse.h,v 1.1.2.2 2004/06/30 22:53:32 drory Exp $
* ---------------------------------------------------------------------
*
* Copyright (C) 2000-2002 Niv Drory <drory@usm.uni-muenchen.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
*
* ---------------------------------------------------------------------
*
*/


#if !defined(__LTL_APPLICOPS_H__)
#error "<ltl/misc/applicops_sse.h> must be included after <ltl/misc/applicops.h> !"
#endif

#if !defined(__SSE2__)
#error "<ltl/misc/applicops_sse.h> needs SSE/SSE2 extensions to be enabled.\nConsult your compiler manual on how to do so\nUse the flag(s) -msse2 in GCC"
#endif

#ifndef __LTL_APPLICOPS_SIMD_H__
#define __LTL_APPLICOPS_SIMD_H__


LTL_BEGIN_NAMESPACE

/*! \file applicops_sse.h

  Specializations of applicative templates using the MMX/SSE/SSE2
  vector instructions on x86 (Pentium4 and above)
*/

//@{

/*!
   Vector types used by the vectorized applicops, one per supported
   scalar element type. These are the types that the vec_trait mapping
   below hands out. SSE has fewer types than Altivec.

   The integer vectors are 8 bytes wide and the floating point vectors
   are 16 bytes wide.

   The types are declared with the vector_size attribute instead of the
   vector machine modes (mode(V8QI), mode(V4HI), ...): GCC deprecates the
   use of mode for declaring vector types and warns about it. The
   resulting types are the same.
*/
typedef char      vqi __attribute__ ((vector_size(8)));   //!< 8 x char
typedef short     vhi __attribute__ ((vector_size(8)));   //!< 4 x short
typedef int       vsi __attribute__ ((vector_size(8)));   //!< 2 x int
typedef float     vsf __attribute__ ((vector_size(16)));  //!< 4 x float
typedef double    vdf __attribute__ ((vector_size(16)));  //!< 2 x double

/*!
   Compile-time map from a scalar element type \c T to the vector type
   used to process several \c T values at once, exposed as the nested
   typedef \c vec_type.

   The primary template has no \c vec_type member, so asking for the
   vector type of a scalar type without a specialization below fails
   to compile.
*/
template<typename T>
struct vec_trait {};

// NOTE(review): bool shares the int vector type -- confirm this is intended.
template<> struct vec_trait<bool>   { typedef vsi vec_type; };

template<> struct vec_trait<char>   { typedef vqi vec_type; };

template<> struct vec_trait<short>  { typedef vhi vec_type; };

template<> struct vec_trait<int>    { typedef vsi vec_type; };

template<> struct vec_trait<float>  { typedef vsf vec_type; };

template<> struct vec_trait<double> { typedef vdf vec_type; };

// Shorthands for the vector type belonging to scalar type T.
//   VEC_TYPE(T)   : vec_trait<T>::vec_type
//   T_VEC_TYPE(T) : the same, prefixed with 'typename'
#define VEC_TYPE(T)     vec_trait<T>::vec_type
#define T_VEC_TYPE(T)   typename VEC_TYPE(T)

//@}


//@{

/*!
 *  implementation of some missing math functions using SSE primitives
 */
//@}

/*!
 *  Applicative templates for vectorized binary operators.
 *
 *  Expands to the full specialization <tt>applic<elem_t,elem_t></tt> of an
 *  applicative template that is otherwise used for scalar processing.
 *  If a specialization for the given operation and its argument types
 *  exists, the expression can be vectorized.
 *
 *  Macro arguments, in order:
 *    - applic : applicative template to specialize
 *    - oper   : infix operator token used by the scalar path
 *    - elem_t : scalar type of both operands
 *    - vec_fn : two-argument vector primitive used by the vector path
 *
 *  The generated struct derives from _et_applic_base and provides
 *    - isVectorizable = 1
 *    - value_type, the c-style promoted type promotion_trait<elem_t,elem_t>::PType
 *    - eval(a,b), returning <tt>a oper b</tt> as value_type
 *    - eval_vec(a,b), returning vec_fn(a,b) cast to VEC_TYPE(elem_t)
 *
 *  The expansion does not end in a semicolon; the invocation supplies it.
 */
#define MAKE_BINAP_VEC(applic, oper, elem_t, vec_fn)                    \
template<>                                                              \
struct applic<elem_t,elem_t> : public _et_applic_base                   \
{                                                                       \
   typedef promotion_trait<elem_t,elem_t>::PType value_type;            \
   enum { isVectorizable = 1 };                                         \
                                                                        \
   static inline value_type eval( const elem_t& a, const elem_t& b )    \
   {                                                                    \
      return a oper b;                                                  \
   }                                                                    \
                                                                        \
   static inline VEC_TYPE(elem_t)                                       \
   eval_vec( const VEC_TYPE(elem_t) & a, const VEC_TYPE(elem_t) & b )   \
   {                                                                    \
      return (VEC_TYPE(elem_t))vec_fn(a,b);                             \
   }                                                                    \
}

/*!
 *  Applicative templates for vectorized binary functions.
 *
 *  Same layout as the binary operator variant, except that the scalar
 *  path calls a function, <tt>func(a,b)</tt>, instead of applying an
 *  infix operator.
 *
 *  Macro arguments, in order:
 *    - applic : applicative template to specialize
 *    - func   : two-argument scalar function used by eval()
 *    - elem_t : scalar type of both operands
 *    - vec_fn : two-argument vector primitive used by eval_vec()
 *
 *  value_type is promotion_trait<elem_t,elem_t>::PType; eval_vec() casts
 *  the result of vec_fn to VEC_TYPE(elem_t).
 *
 *  The expansion does not end in a semicolon; the invocation supplies it.
 */
#define MAKE_BINAP_FUNC_VEC(applic, func, elem_t, vec_fn)               \
template<>                                                              \
struct applic<elem_t,elem_t> : public _et_applic_base                   \
{                                                                       \
   typedef promotion_trait<elem_t,elem_t>::PType value_type;            \
   enum { isVectorizable = 1 };                                         \
                                                                        \
   static inline value_type eval( const elem_t& a, const elem_t& b )    \
   {                                                                    \
      return func(a,b);                                                 \
   }                                                                    \
                                                                        \
   static inline VEC_TYPE(elem_t)                                       \
   eval_vec( const VEC_TYPE(elem_t) & a, const VEC_TYPE(elem_t) & b )   \
   {                                                                    \
      return (VEC_TYPE(elem_t))vec_fn(a,b);                             \
   }                                                                    \
}

/*!
 *  Applicative templates for vectorized unary operators returning the
 *  same type as their input.
 *
 *  Macro arguments, in order:
 *    - applic : applicative template to specialize
 *    - oper   : prefix operator (or function name) applied as oper(a)
 *    - elem_t : scalar operand type, also used as value_type
 *    - vec_fn : one-argument vector primitive used by eval_vec()
 *
 *  eval_vec() casts the result of vec_fn to VEC_TYPE(elem_t).
 *
 *  The expansion does not end in a semicolon; the invocation supplies it.
 */
#define MAKE_UNAP_VEC(applic, oper, elem_t, vec_fn)                     \
template<>                                                              \
struct applic<elem_t> : public _et_applic_base                          \
{                                                                       \
   typedef elem_t value_type;                                           \
   enum { isVectorizable = 1 };                                         \
                                                                        \
   static inline elem_t eval( const elem_t& a )                         \
   {                                                                    \
      return oper(a);                                                   \
   }                                                                    \
                                                                        \
   static inline VEC_TYPE(elem_t) eval_vec( const VEC_TYPE(elem_t) & a ) \
   {                                                                    \
      return (VEC_TYPE(elem_t))vec_fn(a);                               \
   }                                                                    \
}

/*!
 *  Applicative templates for vectorized unary functions.
 *
 *  Macro arguments, in order (note that the element type comes second
 *  here, before the scalar function):
 *    - applic : applicative template to specialize
 *    - elem_t : scalar operand type, also used as value_type
 *    - func   : one-argument scalar function used by eval()
 *    - vec_fn : one-argument vector primitive used by eval_vec()
 *
 *  eval_vec() returns the result of vec_fn without a cast, so vec_fn
 *  must yield something convertible to VEC_TYPE(elem_t).
 *
 *  The expansion does not end in a semicolon; the invocation supplies it.
 */
#define MAKE_UNAP_FUNC_VEC(applic, elem_t, func, vec_fn)                \
template<>                                                              \
struct applic<elem_t> : public _et_applic_base                          \
{                                                                       \
   typedef elem_t value_type;                                           \
   enum { isVectorizable = 1 };                                         \
                                                                        \
   static inline elem_t eval( const elem_t& a )                         \
   {                                                                    \
      return func(a);                                                   \
   }                                                                    \
                                                                        \
   static inline VEC_TYPE(elem_t) eval_vec( const VEC_TYPE(elem_t) & a ) \
   {                                                                    \
      return vec_fn(a);                                                 \
   }                                                                    \
}


// Unfortunately, MMX/SSE/SSE2/SSE3 define far fewer operations than
// Altivec, but we can at least use the ones that exist.
//    TODO: We should try to implement more operations in terms of
//          these primitives.

// Addition: packed integer adds for char/short/int, packed
// single-precision add for float.
MAKE_BINAP_VEC(__ltl_TAdd, +, char  , __builtin_ia32_paddb);
MAKE_BINAP_VEC(__ltl_TAdd, +, short , __builtin_ia32_paddw);
MAKE_BINAP_VEC(__ltl_TAdd, +, int   , __builtin_ia32_paddd);
MAKE_BINAP_VEC(__ltl_TAdd, +, float , __builtin_ia32_addps);


// Subtraction: same set of element types as addition.
MAKE_BINAP_VEC(__ltl_TSub, -, char  , __builtin_ia32_psubb);
MAKE_BINAP_VEC(__ltl_TSub, -, short , __builtin_ia32_psubw);
MAKE_BINAP_VEC(__ltl_TSub, -, int   , __builtin_ia32_psubd);
MAKE_BINAP_VEC(__ltl_TSub, -, float , __builtin_ia32_subps);

// Multiplication and division are only specialized for float here;
// no integer variants are provided.
MAKE_BINAP_VEC(__ltl_TMul, *, float , __builtin_ia32_mulps);

MAKE_BINAP_VEC(__ltl_TDiv, /, float , __builtin_ia32_divps);

#ifdef __SSE3__
MAKE_BINAP_VEC(__ltl_TAdd, +, double, __builtin_ia32_haddpd);
MAKE_BINAP_VEC(__ltl_TSub, -, double, __builtin_ia32_hsubpd);
MAKE_BINAP_VEC(__ltl_TMul, *, double, __builtin_ia32_mulpd);
MAKE_BINAP_VEC(__ltl_TDiv, /, double, __builtin_ia32_divpd);
#endif

// Comparisons. Integer element types only get equality; float gets the
// full set of ==, >, <, <=, >=. There are no double comparisons.
//
// As for every MAKE_BINAP_VEC specialization, value_type is
// promotion_trait<type,type>::PType and eval_vec() casts the result of
// the compare primitive to the operand vector type.
//
// NOTE(review): the scalar eval() returns the C++ comparison result
// converted to value_type, whereas the packed compare primitives are
// expected to produce per-lane bit masks -- confirm that users of
// eval_vec() interpret the result accordingly.
MAKE_BINAP_VEC(__ltl_TEQ , ==, char ,  __builtin_ia32_pcmpeqb);
MAKE_BINAP_VEC(__ltl_TEQ , ==, short,  __builtin_ia32_pcmpeqw);
MAKE_BINAP_VEC(__ltl_TEQ , ==, int  ,  __builtin_ia32_pcmpeqd);

MAKE_BINAP_VEC(__ltl_TEQ , ==, float, __builtin_ia32_cmpeqps);
MAKE_BINAP_VEC(__ltl_TGT , > , float, __builtin_ia32_cmpgtps);
MAKE_BINAP_VEC(__ltl_TLT , < , float, __builtin_ia32_cmpltps);
MAKE_BINAP_VEC(__ltl_TLE , <=, float, __builtin_ia32_cmpleps);
MAKE_BINAP_VEC(__ltl_TGE , >=, float, __builtin_ia32_cmpgeps);

// Square root: scalar path calls std::sqrt, vector path uses the packed
// square root primitive for float and double respectively.
// Note the argument order of MAKE_UNAP_FUNC_VEC: (template, type, scalar
// function, vector primitive).
MAKE_UNAP_FUNC_VEC(__ltl_sqrt, float , std::sqrt, __builtin_ia32_sqrtps);
MAKE_UNAP_FUNC_VEC(__ltl_sqrt, double, std::sqrt, __builtin_ia32_sqrtpd);

LTL_END_NAMESPACE

#endif // __LTL_APPLICOPS_SIMD_H__
