/*

    x86 specific optimized assembler dsp routines
    Copyright (C) 2001-2002 Jussi Laako

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

*/


#ifdef DSP_X86


#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <math.h>
#include <float.h>

#include "dsp/X86.h"


static char cpCPUid[13];


#ifdef __cplusplus
extern "C"
{
#endif


const char *dsp_x86_cpuid ()
{
    unsigned int *ipCPUid = (unsigned int *) cpCPUid;
    
    X86_ASM (
        "pushl %%eax\n\t" \
        "pushl %%ebx\n\t" \
        "pushl %%ecx\n\t" \
        "pushl %%edx\n\t" \
        "xorl %%eax, %%eax\n\t" \
        "cpuid\n\t" \
        "movl %%ebx, %0\n\t" \
        "movl %%ecx, %2\n\t" \
        "movl %%edx, %1\n\t" \
        "popl %%edx\n\t" \
        "popl %%ecx\n\t" \
        "popl %%ebx\n\t" \
        "popl %%eax\n\t"
        : "=m" (ipCPUid[0]),
          "=m" (ipCPUid[1]),
          "=m" (ipCPUid[2])
        :
        : "eax", "ebx", "ecx", "edx", "memory");
    cpCPUid[12] = '\0';

    return cpCPUid;
}


/*
    Return the standard feature flags, i.e. the EDX value reported by
    CPUID function 1.

    All operands are passed in registers.  The previous implementation
    pushed four registers and then stored through an "=m" operand that
    referred to a stack local; when the compiler addresses locals relative
    to %esp (no frame pointer) the pushes shift that address and the result
    is written to the wrong stack slot.

    EBX is saved in ESI around CPUID instead of being listed as clobbered,
    because EBX can be reserved as the PIC base register on 32-bit x86.
*/
unsigned int dsp_x86_features ()
{
    unsigned int uiEax;
    unsigned int uiFeatures = 0;

    X86_ASM (
        "movl %%ebx, %%esi\n\t"
        "cpuid\n\t"
        "movl %%esi, %%ebx\n\t"
        : "=a" (uiEax),
          "=d" (uiFeatures)
        : "0" (1)
        : "ecx", "esi", "memory");
    (void) uiEax;   /* EAX output (processor signature) is not needed */

    return uiFeatures;
}


/*
    Return the extended feature flags, i.e. the EDX value reported by
    CPUID function 0x80000001, or 0 when the processor does not implement
    that extended function.

    Operands are passed in registers only; the previous implementation
    accessed stack locals through "m" operands after pushing registers,
    which breaks when locals are addressed relative to %esp.

    EBX is preserved through ESI because it can be reserved as the PIC
    base register on 32-bit x86.
*/
unsigned int dsp_x86_amd_features ()
{
    unsigned int uiEax;
    unsigned int uiFeatures = 0;

    /* Function 0x80000000 reports the highest supported extended function */
    X86_ASM (
        "movl %%ebx, %%esi\n\t"
        "cpuid\n\t"
        "movl %%esi, %%ebx\n\t"
        : "=a" (uiEax)
        : "0" (0x80000000U)
        : "ecx", "edx", "esi", "memory");
    if (uiEax < 0x80000001U)
        return 0;

    X86_ASM (
        "movl %%ebx, %%esi\n\t"
        "cpuid\n\t"
        "movl %%esi, %%ebx\n\t"
        : "=a" (uiEax),
          "=d" (uiFeatures)
        : "0" (0x80000001U)
        : "ecx", "esi", "memory");

    return uiFeatures;
}


/*
    Return 1 when the CPU vendor string is "AuthenticAMD" and both bit 31
    and bit 30 of the extended feature flags are set, otherwise 0.

    The masks are built from unsigned constants: (1 << 31) overflows a
    signed int.
*/
extern int dsp_x86_have_e3dnow ()
{
    const unsigned int uiRequired = (1U << 31) | (1U << 30);
    unsigned int uiFeatures;

    if (strcmp(dsp_x86_cpuid(), "AuthenticAMD") != 0)
        return 0;

    uiFeatures = dsp_x86_amd_features();
    if ((uiFeatures & uiRequired) == uiRequired)
        return 1;
    return 0;
}


/*
    Return 1 when both bit 25 and bit 26 of the standard feature flags
    (see dsp_x86_features()) are set, otherwise 0.
*/
extern int dsp_x86_have_sse2 ()
{
    const unsigned int uiFlags = dsp_x86_features();

    if (!(uiFlags & (1 << 25)))
        return 0;
    if (!(uiFlags & (1 << 26)))
        return 0;
    return 1;
}


void dsp_x86_3dnow_copyf (float *fpDest, const float *fpSrc, int iDataLength)
{
    int iStartIdx;
    int iDataCntr;
    int iDataCount;
    stpm64 m64pDest = (stpm64) fpDest;
    stpm64 m64pSrc = (stpm64) fpSrc;
    
    iStartIdx = 0;
    //#if ((__GNUC__ < 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ < 1)))
    X86_ASM (
        "prefetchnta %0\n\t" \
        "prefetchnta %1\n\t" \
        "prefetchnta %2\n\t" \
        "prefetchnta %3\n\t"
        :
        : "m" (m64pSrc[0]),
          "m" (m64pSrc[8]),
          "m" (m64pSrc[16]),
          "m" (m64pSrc[24]));
    //#endif
    iDataCount = ((iDataLength & 0xfffffff0) >> 1);
    for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
    {
        //#if ((__GNUC__ < 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ < 1)))
        X86_ASM (
            "prefetchnta %16\n\t" \
            "movq %8, %%mm0\n\t" \
            "movq %9, %%mm1\n\t" \
            "movq %10, %%mm2\n\t" \
            "movq %11, %%mm3\n\t" \
            "movq %12, %%mm4\n\t" \
            "movq %13, %%mm5\n\t" \
            "movq %14, %%mm6\n\t" \
            "movq %15, %%mm7\n\t" \
            "movntq %%mm0, %0\n\t" \
            "movntq %%mm1, %1\n\t" \
            "movntq %%mm2, %2\n\t" \
            "movntq %%mm3, %3\n\t" \
            "movntq %%mm4, %4\n\t" \
            "movntq %%mm5, %5\n\t" \
            "movntq %%mm6, %6\n\t" \
            "movntq %%mm7, %7\n\t"
            : "=m" (m64pDest[iDataCntr]),
              "=m" (m64pDest[iDataCntr + 1]),
              "=m" (m64pDest[iDataCntr + 2]),
              "=m" (m64pDest[iDataCntr + 3]),
              "=m" (m64pDest[iDataCntr + 4]),
              "=m" (m64pDest[iDataCntr + 5]),
              "=m" (m64pDest[iDataCntr + 6]),
              "=m" (m64pDest[iDataCntr + 7])
            : "m" (m64pSrc[iDataCntr]),
              "m" (m64pSrc[iDataCntr + 1]),
              "m" (m64pSrc[iDataCntr + 2]),
              "m" (m64pSrc[iDataCntr + 3]),
              "m" (m64pSrc[iDataCntr + 4]),
              "m" (m64pSrc[iDataCntr + 5]),
              "m" (m64pSrc[iDataCntr + 6]),
              "m" (m64pSrc[iDataCntr + 7]),
              "m" (m64pSrc[iDataCntr + 32])
            : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
        /*#else
        X86_ASM (
            "movq %8, %%mm0\n\t" \
            "movq %9, %%mm1\n\t" \
            "movq %10, %%mm2\n\t" \
            "movq %11, %%mm3\n\t" \
            "movq %12, %%mm4\n\t" \
            "movq %13, %%mm5\n\t" \
            "movq %14, %%mm6\n\t" \
            "movq %15, %%mm7\n\t" \
            "movntq %%mm0, %0\n\t" \
            "movntq %%mm1, %1\n\t" \
            "movntq %%mm2, %2\n\t" \
            "movntq %%mm3, %3\n\t" \
            "movntq %%mm4, %4\n\t" \
            "movntq %%mm5, %5\n\t" \
            "movntq %%mm6, %6\n\t" \
            "movntq %%mm7, %7\n\t"
            : "=m" (m64pDest[iDataCntr]),
              "=m" (m64pDest[iDataCntr + 1]),
              "=m" (m64pDest[iDataCntr + 2]),
              "=m" (m64pDest[iDataCntr + 3]),
              "=m" (m64pDest[iDataCntr + 4]),
              "=m" (m64pDest[iDataCntr + 5]),
              "=m" (m64pDest[iDataCntr + 6]),
              "=m" (m64pDest[iDataCntr + 7])
            : "m" (m64pSrc[iDataCntr]),
              "m" (m64pSrc[iDataCntr + 1]),
              "m" (m64pSrc[iDataCntr + 2]),
              "m" (m64pSrc[iDataCntr + 3]),
              "m" (m64pSrc[iDataCntr + 4]),
              "m" (m64pSrc[iDataCntr + 5]),
              "m" (m64pSrc[iDataCntr + 6]),
              "m" (m64pSrc[iDataCntr + 7]),
              "m" (m64pSrc[iDataCntr + 32])
            : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
        #endif*/
    }
    iStartIdx = iDataCount;
    iDataCount = ((iDataLength & 0xfffffffe) >> 1);
    for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
    {
        //#if ((__GNUC__ < 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ < 1)))
        X86_ASM (
            "prefetchnta %2\n\t" \
            "movq %1, %%mm0\n\t" \
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pDest[iDataCntr])
            : "m" (m64pSrc[iDataCntr]),
              "m" (m64pSrc[iDataCntr + 32])
            : "mm0", "memory");
        /*#else
        X86_ASM (
            "movq %1, %%mm0\n\t" \
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pDest[iDataCntr])
            : "m" (m64pSrc[iDataCntr])
            : "mm0", "memory");
        #endif*/
    }
    if (iDataLength & 0x1)
    {
        X86_ASM (
            "movd %1, %%mm0\n\t" \
            "movd %%mm0, %0\n\t"
            : "=m" (fpDest[iDataLength - 1])
            : "m" (fpSrc[iDataLength - 1])
            : "mm0", "memory");
    }
    X86_ASM (
        "femms\n\t" \
        "sfence\n\t");
}


/*
    Copy iDataLength doubles from dpSrc to dpDest through the MMX registers
    using non-temporal (movntq) stores.

    Blocks of eight doubles are moved per iteration first; the remaining
    (iDataLength modulo 8) doubles are then moved one at a time.  Source
    data is prefetched 32 elements ahead of the current position.
*/
void dsp_x86_3dnow_copyd (double *dpDest, const double *dpSrc, int iDataLength)
{
    int iIdx;
    int iBlockEnd;
    double *dpOut;
    const double *dpIn;

    /* Warm up the first four prefetch targets */
    X86_ASM (
        "prefetchnta %0\n\t"
        "prefetchnta %1\n\t"
        "prefetchnta %2\n\t"
        "prefetchnta %3\n\t"
        :
        : "m" (dpSrc[0]),
          "m" (dpSrc[8]),
          "m" (dpSrc[16]),
          "m" (dpSrc[24]));

    /* Eight doubles per iteration */
    iBlockEnd = (iDataLength & 0xfffffff8);
    iIdx = 0;
    while (iIdx < iBlockEnd)
    {
        dpOut = dpDest + iIdx;
        dpIn = dpSrc + iIdx;
        X86_ASM (
            "prefetchnta %16\n\t"
            "movq %8, %%mm0\n\t"
            "movq %9, %%mm1\n\t"
            "movq %10, %%mm2\n\t"
            "movq %11, %%mm3\n\t"
            "movq %12, %%mm4\n\t"
            "movq %13, %%mm5\n\t"
            "movq %14, %%mm6\n\t"
            "movq %15, %%mm7\n\t"
            "movntq %%mm0, %0\n\t"
            "movntq %%mm1, %1\n\t"
            "movntq %%mm2, %2\n\t"
            "movntq %%mm3, %3\n\t"
            "movntq %%mm4, %4\n\t"
            "movntq %%mm5, %5\n\t"
            "movntq %%mm6, %6\n\t"
            "movntq %%mm7, %7\n\t"
            : "=m" (dpOut[0]),
              "=m" (dpOut[1]),
              "=m" (dpOut[2]),
              "=m" (dpOut[3]),
              "=m" (dpOut[4]),
              "=m" (dpOut[5]),
              "=m" (dpOut[6]),
              "=m" (dpOut[7])
            : "m" (dpIn[0]),
              "m" (dpIn[1]),
              "m" (dpIn[2]),
              "m" (dpIn[3]),
              "m" (dpIn[4]),
              "m" (dpIn[5]),
              "m" (dpIn[6]),
              "m" (dpIn[7]),
              "m" (dpIn[32])
            : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
        iIdx += 8;
    }

    /* Leftover doubles, one per iteration */
    for (iIdx = iBlockEnd; iIdx < iDataLength; iIdx++)
    {
        X86_ASM (
            "prefetchnta %2\n\t"
            "movq %1, %%mm0\n\t"
            "movntq %%mm0, %0\n\t"
            : "=m" (dpDest[iIdx])
            : "m" (dpSrc[iIdx]),
              "m" (dpSrc[iIdx + 32])
            : "mm0", "memory");
    }

    /* Leave MMX state and order the non-temporal stores */
    X86_ASM (
        "femms\n\t"
        "sfence\n\t");
}


/*
    In place: fpVect[i] += fSrc for i = 0 .. iDataLength - 1.

    Two floats are processed per iteration; an odd trailing element is
    handled separately with 32-bit moves.  mm1 holds the addend in both
    lanes for the whole function.
*/
void dsp_x86_3dnow_addf (float *fpVect, float fSrc, int iDataLength)
{
    int iPairIdx;
    int iPairCount;
    stpm64 m64pVect = (stpm64) fpVect;
    stm64 m64Addend;

    m64Addend.f[1] = fSrc;
    m64Addend.f[0] = m64Addend.f[1];
    iPairCount = (iDataLength >> 1);

    /* Broadcast addend -> mm1 */
    X86_ASM (
        "movq %0, %%mm1\n\t"
        :
        : "m" (m64Addend)
        : "mm1", "memory");

    iPairIdx = 0;
    while (iPairIdx < iPairCount)
    {
        X86_ASM (
            "movq %1, %%mm0\n\t"
            "pfadd %%mm1, %%mm0\n\t"
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pVect[iPairIdx])
            : "0" (m64pVect[iPairIdx])
            : "mm0", "mm1", "memory");
        iPairIdx++;
    }

    /* Odd trailing element */
    if (iDataLength & 0x1)
    {
        X86_ASM (
            "movd %1, %%mm0\n\t"
            "pfadd %%mm1, %%mm0\n\t"
            "movd %%mm0, %0\n\t"
            : "=m" (fpVect[iDataLength - 1])
            : "0" (fpVect[iDataLength - 1])
            : "mm0", "mm1", "memory");
    }

    X86_ASM (
        "femms\n\t"
        "sfence\n\t");
}


/*
    In place: fpVect[i] *= fSrc for i = 0 .. iDataLength - 1.

    Two floats are processed per iteration; an odd trailing element is
    handled separately with 32-bit moves.  mm1 holds the factor in both
    lanes for the whole function.
*/
void dsp_x86_3dnow_mulf (float *fpVect, float fSrc, int iDataLength)
{
    int iPairIdx;
    int iPairCount;
    stpm64 m64pVect = (stpm64) fpVect;
    stm64 m64Factor;

    m64Factor.f[1] = fSrc;
    m64Factor.f[0] = m64Factor.f[1];
    iPairCount = (iDataLength >> 1);

    /* Broadcast factor -> mm1 */
    X86_ASM (
        "movq %0, %%mm1\n\t"
        :
        : "m" (m64Factor)
        : "mm1", "memory");

    iPairIdx = 0;
    while (iPairIdx < iPairCount)
    {
        X86_ASM (
            "movq %1, %%mm0\n\t"
            "pfmul %%mm1, %%mm0\n\t"
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pVect[iPairIdx])
            : "0" (m64pVect[iPairIdx])
            : "mm0", "mm1", "memory");
        iPairIdx++;
    }

    /* Odd trailing element */
    if (iDataLength & 0x1)
    {
        X86_ASM (
            "movd %1, %%mm0\n\t"
            "pfmul %%mm1, %%mm0\n\t"
            "movd %%mm0, %0\n\t"
            : "=m" (fpVect[iDataLength - 1])
            : "0" (fpVect[iDataLength - 1])
            : "mm0", "mm1", "memory");
    }

    X86_ASM (
        "femms\n\t"
        "sfence\n\t");
}


void dsp_x86_3dnow_mulf_nip (float *fpDest, const float *fpSrc1, float fSrc2, 
    int iDataLength)
{
    int iDataCntr;
    int iDataCount;
    stpm64 m64pDest = (stpm64) fpDest;
    stpm64 m64pSrc1 = (stpm64) fpSrc1;
    stm64 m64Src2;

    m64Src2.f[0] = m64Src2.f[1] = fSrc2;
    iDataCount = (iDataLength >> 1);
    X86_ASM (
        "movq %0, %%mm1\n\t"
        :
        : "m" (m64Src2)
        : "mm1", "memory");
    for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
    {
        X86_ASM (
            "movq %1, %%mm0\n\t" \
            "pfmul %%mm1, %%mm0\n\t" \
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pDest[iDataCntr])
            : "m" (m64pSrc1[iDataCntr])
            : "mm0", "mm1", "memory");
    }
    if (iDataLength & 0x1)
    {
        X86_ASM (
            "movd %1, %%mm0\n\t" \
            "pfmul %%mm1, %%mm0\n\t" \
            "movd %%mm0, %0\n\t"
            : "=m" (fpDest[iDataLength - 1])
            : "m" (fpSrc1[iDataLength - 1])
            : "mm0", "mm1", "memory");
    }
    X86_ASM (
        "femms\n\t" \
        "sfence\n\t");
}


void dsp_x86_3dnow_add2f (float *fpDest, const float *fpSrc, int iDataLength)
{
    int iDataCntr;
    int iDataCount;
    stpm64 m64pDest = (stpm64) fpDest;
    stpm64 m64pSrc = (stpm64) fpSrc;

    iDataCount = (iDataLength >> 1);
    for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
    {
        X86_ASM (
            "movq %1, %%mm0\n\t" \
            "movq %2, %%mm1\n\t" \
            "pfadd %%mm1, %%mm0\n\t" \
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pDest[iDataCntr])
            : "0" (m64pDest[iDataCntr]),
              "m" (m64pSrc[iDataCntr])
            : "mm0", "mm1", "memory");
    }
    if (iDataLength & 0x1)
    {
        X86_ASM (
            "movd %1, %%mm0\n\t" \
            "movd %2, %%mm1\n\t" \
            "pfadd %%mm1, %%mm0\n\t" \
            "movd %%mm0, %0\n\t"
            : "=m" (fpDest[iDataLength - 1])
            : "0" (fpDest[iDataLength - 1]),
              "m" (fpSrc[iDataLength - 1])
            : "mm0", "mm1", "memory");
    }
    X86_ASM (
        "femms\n\t" \
        "sfence\n\t");
}


void dsp_x86_3dnow_mul2f (float *fpDest, const float *fpSrc, int iDataLength)
{
    int iDataCntr;
    int iDataCount;
    stpm64 m64pDest = (stpm64) fpDest;
    stpm64 m64pSrc = (stpm64) fpSrc;

    iDataCount = (iDataLength >> 1);
    for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
    {
        X86_ASM (
            "movq %1, %%mm0\n\t" \
            "movq %2, %%mm1\n\t" \
            "pfmul %%mm1, %%mm0\n\t" \
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pDest[iDataCntr])
            : "0" (m64pDest[iDataCntr]),
              "m" (m64pSrc[iDataCntr])
            : "mm0", "mm1", "memory");
    }
    if (iDataLength & 0x1)
    {
        X86_ASM (
            "movd %1, %%mm0\n\t" \
            "movd %2, %%mm1\n\t" \
            "pfmul %%mm1, %%mm0\n\t" \
            "movd %%mm0, %0\n\t"
            : "=m" (fpDest[iDataLength - 1])
            : "0" (fpDest[iDataLength - 1]),
              "m" (fpSrc[iDataLength - 1])
            : "mm0", "mm1", "memory");
    }
    X86_ASM (
        "femms\n\t" \
        "sfence\n\t");
}


void dsp_x86_3dnow_add3f (float *fpDest, const float *fpSrc1, 
    const float *fpSrc2, int iDataLength)
{
    int iDataCntr;
    int iDataCount;
    stpm64 m64pDest = (stpm64) fpDest;
    stpm64 m64pSrc1 = (stpm64) fpSrc1;
    stpm64 m64pSrc2 = (stpm64) fpSrc2;

    iDataCount = (iDataLength >> 1);
    for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
    {
        X86_ASM (
            "movq %1, %%mm0\n\t" \
            "movq %2, %%mm1\n\t" \
            "pfadd %%mm1, %%mm0\n\t" \
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pDest[iDataCntr])
            : "m" (m64pSrc1[iDataCntr]),
              "m" (m64pSrc2[iDataCntr])
            : "mm0", "mm1", "memory");
    }
    if (iDataLength & 0x1)
    {
        X86_ASM (
            "movd %1, %%mm0\n\t" \
            "movd %2, %%mm1\n\t" \
            "pfadd %%mm1, %%mm0\n\t" \
            "movd %%mm0, %0\n\t"
            : "=m" (fpDest[iDataLength - 1])
            : "m" (fpSrc1[iDataLength - 1]),
              "m" (fpSrc2[iDataLength - 1])
            : "mm0", "mm1", "memory");
    }
    X86_ASM (
        "femms\n\t" \
        "sfence\n\t");
}


void dsp_x86_3dnow_mul3f (float *fpDest, const float *fpSrc1, 
    const float *fpSrc2, int iDataLength)
{
    int iDataCntr;
    int iDataCount;
    stpm64 m64pDest = (stpm64) fpDest;
    stpm64 m64pSrc1 = (stpm64) fpSrc1;
    stpm64 m64pSrc2 = (stpm64) fpSrc2;

    iDataCount = (iDataLength >> 1);
    for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
    {
        X86_ASM (
            "movq %1, %%mm0\n\t" \
            "movq %2, %%mm1\n\t" \
            "pfmul %%mm1, %%mm0\n\t" \
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pDest[iDataCntr])
            : "m" (m64pSrc1[iDataCntr]),
              "m" (m64pSrc2[iDataCntr])
            : "mm0", "mm1", "memory");
    }
    if (iDataLength & 0x1)
    {
        X86_ASM (
            "movd %1, %%mm0\n\t" \
            "movd %2, %%mm1\n\t" \
            "pfmul %%mm1, %%mm0\n\t" \
            "movd %%mm0, %0\n\t"
            : "=m" (fpDest[iDataLength - 1])
            : "m" (fpSrc1[iDataLength - 1]),
              "m" (fpSrc2[iDataLength - 1])
            : "mm0", "mm1", "memory");
    }
    X86_ASM (
        "femms\n\t" \
        "sfence\n\t");
}


void dsp_x86_3dnow_cmulf (float *fpDest, const float *fpSrc, int iDataLength)
{
    int iDataCntr;
    stpm64 m64pDest = (stpm64) fpDest;
    
    X86_ASM (
        "movq %0, %%mm3\n\t"
        :
        : "m" (fpSrc)
        : "mm3", "memory");
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        X86_ASM (
            "movq %1, %%mm0\n\t" \
            "movq %%mm3, %%mm1\n\t" \
            "pswapd %%mm1, %%mm2\n\t" \
            "pfmul %%mm0, %%mm1\n\t" \
            "pfmul %%mm0, %%mm2\n\t" \
            "pfpnacc %%mm2, %%mm1\n\t"
            "movntq %%mm1, %0\n\t"
            : "=m" (m64pDest[iDataCntr])
            : "0" (m64pDest[iDataCntr])
            : "mm0", "mm1", "mm2", "mm3", "memory");
    }
    X86_ASM (
        "femms\n\t" \
        "sfence\n\t");
}


void dsp_x86_3dnow_cmul2f (float *fpDest, const float *fpSrc, int iDataLength)
{
    int iDataCntr;
    stpm64 m64pDest = (stpm64) fpDest;
    stpm64 m64pSrc = (stpm64) fpSrc;
    
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        X86_ASM (
            "movq %1, %%mm0\n\t" \
            "movq %2, %%mm1\n\t" \
            "pswapd %%mm1, %%mm2\n\t" \
            "pfmul %%mm0, %%mm1\n\t" \
            "pfmul %%mm0, %%mm2\n\t" \
            "pfpnacc %%mm2, %%mm1\n\t"
            "movntq %%mm1, %0\n\t"
            : "=m" (m64pDest[iDataCntr])
            : "0" (m64pDest[iDataCntr]),
              "m" (m64pSrc[iDataCntr])
            : "mm0", "mm1", "mm2", "memory");
    }
    X86_ASM (
        "femms\n\t" \
        "sfence\n\t");
}


void dsp_x86_3dnow_cmul3f (float *fpDest, const float *fpSrc1, 
    const float *fpSrc2, int iDataLength)
{
    int iDataCntr;
    stpm64 m64pDest = (stpm64) fpDest;
    stpm64 m64pSrc1 = (stpm64) fpSrc1;
    stpm64 m64pSrc2 = (stpm64) fpSrc2;
    
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        X86_ASM (
            "movq %1, %%mm0\n\t" \
            "movq %2, %%mm1\n\t" \
            "pswapd %%mm1, %%mm2\n\t" \
            "pfmul %%mm0, %%mm1\n\t" \
            "pfmul %%mm0, %%mm2\n\t" \
            "pfpnacc %%mm2, %%mm1\n\t"
            "movntq %%mm1, %0\n\t"
            : "=m" (m64pDest[iDataCntr])
            : "m" (m64pSrc1[iDataCntr]),
              "m" (m64pSrc2[iDataCntr])
            : "mm0", "mm1", "mm2", "memory");
    }
    X86_ASM (
        "femms\n\t" \
        "sfence\n\t");
}


/*
    In place multiply-then-add: fpVect[i] = fpVect[i] * fMul + fAdd
    for i = 0 .. iDataLength - 1.

    mm1 holds the factor and mm2 the offset (both lanes) for the whole
    function.  Two floats are processed per iteration; an odd trailing
    element is handled separately with 32-bit moves.
*/
void dsp_x86_3dnow_maf (float *fpVect, float fMul, float fAdd, int iDataLength)
{
    int iPairIdx;
    int iPairCount;
    stpm64 m64pVect = (stpm64) fpVect;
    stm64 m64Factor;
    stm64 m64Offset;

    m64Factor.f[1] = fMul;
    m64Factor.f[0] = m64Factor.f[1];
    m64Offset.f[1] = fAdd;
    m64Offset.f[0] = m64Offset.f[1];
    iPairCount = (iDataLength >> 1);

    /* mm1 = factor, mm2 = offset */
    X86_ASM (
        "movq %0, %%mm1\n\t"
        "movq %1, %%mm2\n\t"
        :
        : "m" (m64Factor),
          "m" (m64Offset)
        : "mm1", "mm2", "memory");

    iPairIdx = 0;
    while (iPairIdx < iPairCount)
    {
        X86_ASM (
            "movq %1, %%mm0\n\t"
            "pfmul %%mm1, %%mm0\n\t"
            "pfadd %%mm2, %%mm0\n\t"
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pVect[iPairIdx])
            : "0" (m64pVect[iPairIdx])
            : "mm0", "mm1", "mm2", "memory");
        iPairIdx++;
    }

    /* Odd trailing element */
    if (iDataLength & 0x1)
    {
        X86_ASM (
            "movd %1, %%mm0\n\t"
            "pfmul %%mm1, %%mm0\n\t"
            "pfadd %%mm2, %%mm0\n\t"
            "movd %%mm0, %0\n\t"
            : "=m" (fpVect[iDataLength - 1])
            : "0" (fpVect[iDataLength - 1])
            : "mm0", "mm1", "mm2", "memory");
    }

    X86_ASM (
        "femms\n\t"
        "sfence\n\t");
}


void dsp_x86_3dnow_ma2f (float *fpDest, const float *fpSrc,
    float fMul, float fAdd, int iDataLength)
{
    int iDataCntr;
    int iDataCount;
    stpm64 m64pDest = (stpm64) fpDest;
    stpm64 m64pSrc = (stpm64) fpSrc;
    stm64 m64Mul;
    stm64 m64Add;

    m64Mul.f[0] = m64Mul.f[1] = fMul;
    m64Add.f[0] = m64Add.f[1] = fAdd;
    iDataCount = (iDataLength >> 1);
    X86_ASM (
        "movq %0, %%mm1\n\t" \
        "movq %1, %%mm2\n\t"
        :
        : "m" (m64Mul),
          "m" (m64Add)
        : "mm1", "mm2", "memory");
    for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
    {
        X86_ASM (
            "movq %1, %%mm0\n\t" \
            "pfmul %%mm1, %%mm0\n\t" \
            "pfadd %%mm2, %%mm0\n\t" \
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pDest[iDataCntr])
            : "m" (m64pSrc[iDataCntr])
            : "mm0", "mm1", "mm2", "memory");
    }
    if (iDataLength & 0x1)
    {
        X86_ASM (
            "movd %1, %%mm0\n\t" \
            "pfmul %%mm1, %%mm0\n\t" \
            "pfadd %%mm2, %%mm0\n\t" \
            "movd %%mm0, %0\n\t"
            : "=m" (fpDest[iDataLength - 1])
            : "m" (fpSrc[iDataLength - 1])
            : "mm0", "mm1", "mm2", "memory");
    }
    X86_ASM (
        "femms\n\t" \
        "sfence\n\t");
}


/*
    In place add-then-multiply: fpVect[i] = (fpVect[i] + fAdd) * fMul
    for i = 0 .. iDataLength - 1.

    mm1 holds the offset and mm2 the factor (both lanes) for the whole
    function.  Two floats are processed per iteration; an odd trailing
    element is handled separately with 32-bit moves.
*/
void dsp_x86_3dnow_amf (float *fpVect, float fAdd, float fMul, int iDataLength)
{
    int iPairIdx;
    int iPairCount;
    stpm64 m64pVect = (stpm64) fpVect;
    stm64 m64Offset;
    stm64 m64Factor;

    m64Offset.f[1] = fAdd;
    m64Offset.f[0] = m64Offset.f[1];
    m64Factor.f[1] = fMul;
    m64Factor.f[0] = m64Factor.f[1];
    iPairCount = (iDataLength >> 1);

    /* mm1 = offset, mm2 = factor */
    X86_ASM (
        "movq %0, %%mm1\n\t"
        "movq %1, %%mm2\n\t"
        :
        : "m" (m64Offset),
          "m" (m64Factor)
        : "mm1", "mm2", "memory");

    iPairIdx = 0;
    while (iPairIdx < iPairCount)
    {
        X86_ASM (
            "movq %1, %%mm0\n\t"
            "pfadd %%mm1, %%mm0\n\t"
            "pfmul %%mm2, %%mm0\n\t"
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pVect[iPairIdx])
            : "0" (m64pVect[iPairIdx])
            : "mm0", "mm1", "mm2", "memory");
        iPairIdx++;
    }

    /* Odd trailing element */
    if (iDataLength & 0x1)
    {
        X86_ASM (
            "movd %1, %%mm0\n\t"
            "pfadd %%mm1, %%mm0\n\t"
            "pfmul %%mm2, %%mm0\n\t"
            "movd %%mm0, %0\n\t"
            : "=m" (fpVect[iDataLength - 1])
            : "0" (fpVect[iDataLength - 1])
            : "mm0", "mm1", "mm2", "memory");
    }

    X86_ASM (
        "femms\n\t"
        "sfence\n\t");
}


float dsp_x86_3dnow_macf (const float *fpSrc1, const float *fpSrc2, 
    int iDataLength)
{
    int iDataCntr;
    int iDataCount;
    float fRes;
    stpm64 m64pSrc1 = (stpm64) fpSrc1;
    stpm64 m64pSrc2 = (stpm64) fpSrc2;

    iDataCount = (iDataLength >> 1);
    X86_ASM (
        "pxor %%mm0, %%mm0\n\t"
        :
        :
        : "mm0");
    for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
    {
        X86_ASM (
            "movq %0, %%mm1\n\t" \
            "movq %1, %%mm2\n\t" \
            "pfmul %%mm2, %%mm1\n\t" \
            "pfacc %%mm1, %%mm0\n\t"
            :
            : "m" (m64pSrc1[iDataCntr]),
              "m" (m64pSrc2[iDataCntr])
            : "mm0", "mm1", "mm2", "memory");
    }
    if (iDataLength & 0x1)
    {
        X86_ASM (
            "movd %0, %%mm1\n\t" \
            "movd %1, %%mm2\n\t" \
            "pfmul %%mm2, %%mm1\n\t" \
            "pfacc %%mm1, %%mm0\n\t"
            :
            : "m" (fpSrc1[iDataLength - 1]),
              "m" (fpSrc2[iDataLength - 1])
            : "mm0", "mm1", "mm2", "memory");
    }
    X86_ASM (
        "pfacc %%mm0, %%mm0\n\t" \
        "movd %%mm0, %0\n\t"
        : "=m" (fRes)
        :
        : "mm0", "memory");
    X86_ASM ("femms\n\t");

    return fRes;
}


/*
    Find the smallest and largest value of fpSrc[0 .. iDataLength - 1].

    fpMin       receives the minimum
    fpMax       receives the maximum
    fpSrc       source vector
    iDataLength number of floats in fpSrc

    mm1 holds the running minimum and mm2 the running maximum (one
    candidate per lane); they start at FLT_MAX and -FLT_MAX, which are
    also the values stored when no element is processed.

    For an odd iDataLength the last sample is duplicated into both lanes
    before the comparison.  A plain movd would leave 0.0 in the upper
    lane, and that zero would take part in the min/max (e.g. giving a
    minimum of 0 for all-positive input).
*/
void dsp_x86_3dnow_minmaxf (float *fpMin, float *fpMax, const float *fpSrc, 
    int iDataLength)
{
    int iDataCntr;
    int iDataCount;
    stm64 m64Min;
    stm64 m64Max;
    stpm64 m64pSrc = (stpm64) fpSrc;
    
    m64Min.f[0] = m64Min.f[1] = FLT_MAX;
    m64Max.f[0] = m64Max.f[1] = -FLT_MAX;
    iDataCount = (iDataLength >> 1);
    X86_ASM (
        "movq %0, %%mm1\n\t" \
        "movq %1, %%mm2\n\t"
        :
        : "m" (m64Min),
          "m" (m64Max)
        : "mm1", "mm2", "memory");
    for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
    {
        X86_ASM (
            "movq %0, %%mm0\n\t" \
            "pfmin %%mm0, %%mm1\n\t" \
            "pfmax %%mm0, %%mm2\n\t"
            :
            : "m" (m64pSrc[iDataCntr])
            : "mm0", "mm1", "mm2", "memory");
    }
    if (iDataLength & 0x1)
    {
        /* punpckldq copies the low dword into the high dword */
        X86_ASM (
            "movd %0, %%mm0\n\t" \
            "punpckldq %%mm0, %%mm0\n\t" \
            "pfmin %%mm0, %%mm1\n\t" \
            "pfmax %%mm0, %%mm2\n\t"
            :
            : "m" (fpSrc[iDataLength - 1])
            : "mm0", "mm1", "mm2", "memory");
    }
    /* Reduce the two lanes of each register to a single value */
    X86_ASM (
        "pswapd %%mm1, %%mm3\n\t" \
        "pfmin %%mm3, %%mm1\n\t" \
        "pswapd %%mm2, %%mm3\n\t" \
        "pfmax %%mm3, %%mm2\n\t" \
        "movd %%mm1, %0\n\t" \
        "movd %%mm2, %1\n\t"
        : "=m" (*fpMin),
          "=m" (*fpMax)
        :
        : "mm1", "mm2", "mm3", "memory");
    X86_ASM ("femms\n\t");
}


/*
    Compute a normalized cross-correlation value of two float vectors.

    With Sxy = sum(fpSrc1[i] * fpSrc2[i]), Sxx = sum(fpSrc1[i]^2),
    Syy = sum(fpSrc2[i]^2) and r = reciprocal of iDataLength, the final asm
    statement evaluates (Sxy * r) * 1 / (sqrt(Sxx * Syy) * r).

    Register usage across the asm statements:
      mm3  accumulator for Sxx
      mm4  accumulator for Syy
      mm5  accumulator for Sxy

    The MMX registers carry state from one asm statement to the next, so
    the statement order must be preserved.

    NOTE(review): after the pfrcp / pfrcpit1 / pfrcpit2 sequence on
    mm6 / mm7 the refined reciprocal appears to end up in mm7, while the
    following multiplications use mm6 (the initial pfrcp estimate) - confirm
    against the 3DNow! manual.  The factor is applied to both numerator and
    denominator.
*/
float dsp_x86_3dnow_crosscorrf (const float *fpSrc1, const float *fpSrc2,
    int iDataLength)
{
    int iDataCntr;
    int iDataCount;
    float fRes;
    stpm64 m64pSrc1 = (stpm64) fpSrc1;
    stpm64 m64pSrc2 = (stpm64) fpSrc2;
    
    iDataCount = (iDataLength >> 1);
    /* Clear the three accumulators */
    X86_ASM (
        "pxor %%mm3, %%mm3\n\t" \
        "pxor %%mm4, %%mm4\n\t" \
        "pxor %%mm5, %%mm5\n\t"
        :
        :
        : "mm3", "mm4", "mm5");
    /* Two samples of each source per iteration */
    for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
    {
        X86_ASM (
            "movq %0, %%mm0\n\t" \
            "movq %1, %%mm1\n\t" \
            "movq %%mm1, %%mm2\n\t" \
            "pfmul %%mm0, %%mm2\n\t" \
            "pfacc %%mm2, %%mm5\n\t" \
            "pfmul %%mm0, %%mm0\n\t" \
            "pfacc %%mm0, %%mm3\n\t" \
            "pfmul %%mm1, %%mm1\n\t" \
            "pfacc %%mm1, %%mm4\n\t"
            :
            : "m" (m64pSrc1[iDataCntr]),
              "m" (m64pSrc2[iDataCntr])
            : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
    }
    /* Odd trailing sample, loaded with 32-bit moves */
    if (iDataLength & 0x1)
    {
        X86_ASM (
            "movd %0, %%mm0\n\t" \
            "movd %1, %%mm1\n\t" \
            "movq %%mm1, %%mm2\n\t" \
            "pfmul %%mm0, %%mm2\n\t" \
            "pfacc %%mm2, %%mm5\n\t" \
            "pfmul %%mm0, %%mm0\n\t" \
            "pfacc %%mm0, %%mm3\n\t" \
            "pfmul %%mm1, %%mm1\n\t" \
            "pfacc %%mm1, %%mm4\n\t"
            :
            : "m" (fpSrc1[iDataLength - 1]),
              "m" (fpSrc2[iDataLength - 1])
            : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
    }
    /*
        Final stage, in template order: fold the accumulator lanes, convert
        iDataLength to float and take its reciprocal, form Sxx * Syy, take
        its square root via the reciprocal square root sequence, scale by
        the reciprocal length, invert, and multiply with the scaled Sxy.
    */
    X86_ASM (
        "pfacc %%mm3, %%mm3\n\t" \
        "pfacc %%mm4, %%mm4\n\t" \
        "pfacc %%mm5, %%mm5\n\t" \
        \
        "movd %1, %%mm6\n\t" \
        "pswapd %%mm6, %%mm7\n\t" \
        "paddd %%mm7, %%mm6\n\t" \
        "pi2fd %%mm6, %%mm7\n\t" \
        \
        "pfrcp %%mm7, %%mm6\n\t" \
        "pfrcpit1 %%mm6, %%mm7\n\t" \
        "pfrcpit2 %%mm6, %%mm7\n\t" \
        \
        "pfmul %%mm3, %%mm4\n\t" \
        \
        "movq %%mm4, %%mm0\n\t" \
        "pfrsqrt %%mm4, %%mm1\n\t" \
        "movq %%mm1, %%mm2\n\t" \
        "pfmul %%mm1, %%mm1\n\t" \
        "pfrsqit1 %%mm4, %%mm1\n\t" \
        "pfrcpit2 %%mm2, %%mm1\n\t" \
        "pfmul %%mm1, %%mm4\n\t" \
        \
        "pfmul %%mm6, %%mm4\n\t" \
        \
        "pfrcp %%mm4, %%mm0\n\t" \
        "pfrcpit1 %%mm0, %%mm4\n\t" \
        "pfrcpit2 %%mm0, %%mm4\n\t" \
        \
        "pfmul %%mm6, %%mm5\n\t" \
        "pfmul %%mm4, %%mm5\n\t" \
        "movd %%mm5, %0\n\t"
        : "=m" (fRes)
        : "m" (iDataLength)
        : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
    X86_ASM ("femms\n\t");

    return fRes;
}


/*
    Convert signed 16-bit samples to floats scaled by 1 / iIntMax (3DNow!).

    fpDest       receives iDataLength floats
    ipSrc        iDataLength source samples
    iDataLength  number of samples to convert
    iIntMax      divisor; every sample is multiplied by its reciprocal

    Whole pairs of samples are converted in MMX registers and written with
    non-temporal 64-bit stores; an odd trailing sample is converted in plain
    C using the same scale factor.
*/
void dsp_x86_3dnow_i16tof (float *fpDest, const short *ipSrc, int iDataLength,
    int iIntMax)
{
    int iDataCntr;
    float fScale;

    /* Replicate iIntMax into both halves of mm1, convert to float and take
       the reciprocal (pfrcp estimate refined by pfrcpit1/pfrcpit2).  The
       result stays in mm1 for the loop below and is also saved in fScale
       for the scalar tail. */
    X86_ASM (
        "movd %1, %%mm1\n\t" \
        "pswapd %%mm1, %%mm2\n\t" \
        "paddd %%mm2, %%mm1\n\t" \
        "pi2fd %%mm1, %%mm1\n\t" \
        "pfrcp %%mm1, %%mm2\n\t" \
        "pfrcpit1 %%mm2, %%mm1\n\t" \
        "pfrcpit2 %%mm2, %%mm1\n\t" \
        "movd %%mm1, %0\n\t"
        : "=m" (fScale)
        : "m" (iIntMax)
        : "mm1", "mm2", "memory");
    /* Each pass reads two samples (one 32-bit movd) and stores two floats
       (one 64-bit movntq), so the loop must stop before an odd last sample;
       otherwise it would access one element past the end of both buffers. */
    for (iDataCntr = 0; iDataCntr < iDataLength - 1; iDataCntr += 2)
    {
        /* punpcklwd duplicates each 16-bit word so that pi2fw finds one
           sample in the low word of each 32-bit lane. */
        X86_ASM (
            "movd %1, %%mm0\n\t" \
            "punpcklwd %%mm0, %%mm0\n\t" \
            "pi2fw %%mm0, %%mm0\n\t" \
            "pfmul %%mm1, %%mm0\n\t" \
            "movntq %%mm0, %0\n\t"
            : "=m" (fpDest[iDataCntr])
            : "m" (ipSrc[iDataCntr])
            : "mm0", "mm1", "memory");
    }
    /* Leave MMX state before any x87 float code runs and make the
       non-temporal stores globally visible. */
    X86_ASM (
        "femms\n\t" \
        "sfence\n\t");
    /* Odd trailing sample, converted without MMX. */
    if (iDataLength > 0 && (iDataLength % 2) != 0)
    {
        fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
    }
}


/*
    Convert signed 32-bit samples to floats scaled by 1 / iIntMax (3DNow!).

    fpDest       receives iDataLength floats
    ipSrc        iDataLength source samples
    iDataLength  number of samples to convert
    iIntMax      divisor; every sample is multiplied by its reciprocal

    Whole pairs of samples are converted in MMX registers and written with
    non-temporal 64-bit stores; an odd trailing sample is converted in plain
    C using the same scale factor.
*/
void dsp_x86_3dnow_i32tof (float *fpDest, const int *ipSrc, int iDataLength,
    int iIntMax)
{
    int iDataCntr;
    float fScale;

    /* Replicate iIntMax into both halves of mm1, convert to float and take
       the reciprocal (pfrcp estimate refined by pfrcpit1/pfrcpit2).  The
       result stays in mm1 for the loop below and is also saved in fScale
       for the scalar tail. */
    X86_ASM (
        "movd %1, %%mm1\n\t" \
        "pswapd %%mm1, %%mm2\n\t" \
        "paddd %%mm2, %%mm1\n\t" \
        "pi2fd %%mm1, %%mm1\n\t" \
        "pfrcp %%mm1, %%mm2\n\t" \
        "pfrcpit1 %%mm2, %%mm1\n\t" \
        "pfrcpit2 %%mm2, %%mm1\n\t" \
        "movd %%mm1, %0\n\t"
        : "=m" (fScale)
        : "m" (iIntMax)
        : "mm1", "mm2", "memory");
    /* Each pass loads two integers (64-bit movq) and stores two floats
       (64-bit movntq), so the loop must stop before an odd last sample;
       otherwise it would access one element past the end of both buffers. */
    for (iDataCntr = 0; iDataCntr < iDataLength - 1; iDataCntr += 2)
    {
        X86_ASM (
            "movq %1, %%mm0\n\t" \
            "pi2fd %%mm0, %%mm0\n\t" \
            "pfmul %%mm1, %%mm0\n\t" \
            "movntq %%mm0, %0\n\t"
            : "=m" (fpDest[iDataCntr])
            : "m" (ipSrc[iDataCntr])
            : "mm0", "mm1", "memory");
    }
    /* Leave MMX state before any x87 float code runs and make the
       non-temporal stores globally visible. */
    X86_ASM (
        "femms\n\t" \
        "sfence\n\t");
    /* Odd trailing sample, converted without MMX. */
    if (iDataLength > 0 && (iDataLength % 2) != 0)
    {
        fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
    }
}


/*
    FIR filter (3DNow!).

    fpDest        receives iDataLength output samples
    fpSrc         input buffer of iDataLength + iCoeffLength floats; the
                  sample belonging to output i is fpSrc[iCoeffLength + i],
                  the elements before it are the filter history
    iDataLength   number of output samples
    fpCoeff       iCoeffLength filter coefficients
    iCoeffLength  number of coefficients

    Computes, for every i in [0, iDataLength):

        fpDest[i] = sum over k of fpCoeff[k] * fpSrc[iCoeffLength + i - k]

    Two outputs are produced per pass of the main loop and written with a
    non-temporal 64-bit store; an odd last output is computed on its own.
*/
void dsp_x86_3dnow_firf (float *fpDest, const float *fpSrc, int iDataLength, 
    const float *fpCoeff, int iCoeffLength)
{
    int iSrcCntr;
    int iDestCntr;
    int iCoeffCntr;
    int iSrcCount;
    stpm64 m64pDest = (stpm64) fpDest;

    iDestCntr = 0;
    iSrcCount = iDataLength + iCoeffLength;
    /* Whole output pairs only: a pass reads fpSrc[iSrcCntr + 1] and stores
       two floats, so running it for an odd last sample would read and write
       one element past the end of the buffers. */
    for (iSrcCntr = iCoeffLength; 
        iSrcCntr < iSrcCount - 1; 
        iSrcCntr += 2)
    {
        /* mm0 accumulates the two output samples of this pass */
        X86_ASM (
            "pxor %%mm0, %%mm0\n\t" 
            :
            :
            : "mm0");
        for (iCoeffCntr = 0; 
            iCoeffCntr < iCoeffLength;
            iCoeffCntr++)
        {
            /* mm1 = two neighbouring input samples; the coefficient is
               copied into both halves of mm2 (pswapd + pfadd) so that one
               pfmul applies it to both samples. */
            X86_ASM (
                "movq %0, %%mm1\n\t" \
                "movd %1, %%mm2\n\t" \
                "pswapd %%mm2, %%mm3\n\t" \
                "pfadd %%mm3, %%mm2\n\t" \
                "pfmul %%mm2, %%mm1\n\t" \
                "pfadd %%mm1, %%mm0\n\t" 
                :
                : "m" (fpSrc[iSrcCntr - iCoeffCntr]),
                  "m" (fpCoeff[iCoeffCntr])
                : "mm0", "mm1", "mm2", "mm3", "memory");
        }
        X86_ASM (
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pDest[iDestCntr])
            :
            : "mm0", "memory");
        iDestCntr++;
    }
    if (iDataLength > 0 && (iDataLength & 0x1))
    {
        /* Last output of an odd-length block.  It uses the same input
           window as the pair loop, i.e. the window ends at
           fpSrc[iSrcCount - 1]. */
        X86_ASM (
            "pxor %%mm0, %%mm0\n\t" 
            :
            :
            : "mm0");
        for (iCoeffCntr = 0; 
            iCoeffCntr < iCoeffLength;
            iCoeffCntr++)
        {
            X86_ASM (
                "movd %0, %%mm1\n\t" \
                "movd %1, %%mm2\n\t" \
                "pfmul %%mm2, %%mm1\n\t" \
                "pfadd %%mm1, %%mm0\n\t" 
                :
                : "m" (fpSrc[iSrcCount - 1 - iCoeffCntr]),
                  "m" (fpCoeff[iCoeffCntr])
                : "mm0", "mm1", "mm2", "memory");
        }
        X86_ASM (
            "movd %%mm0, %0\n\t"
            : "=m" (fpDest[iDataLength - 1])
            :
            : "mm0", "memory");
    }
    /* Leave MMX state and make the non-temporal stores globally visible. */
    X86_ASM (
        "femms\n\t" \
        "sfence\n\t");
}


/*
    In-place second order IIR filter (3DNow!).

    fpVect       iDataLength samples, replaced by the filter output
    iDataLength  number of samples
    fpCoeff      five coefficients, see the formula below
    fpX          input history; [0] and [1] are read and updated, [2]
                 receives the last input sample processed
    fpY          output history; [0] and [1] are read and updated

    For every sample x the output is

        y = fpCoeff[0] * x
          + fpCoeff[1] * fpX[1] + fpCoeff[2] * fpX[0]
          + fpCoeff[3] * fpY[1] + fpCoeff[4] * fpY[0]

    after which the histories are shifted: fpX becomes (fpX[1], x) and fpY
    becomes (fpY[1], y).  The feedback terms are added, not subtracted.

    Nothing is read or written when iDataLength <= 0.
*/
void dsp_x86_3dnow_iirf (float *fpVect, int iDataLength, const float *fpCoeff, 
    float *fpX, float *fpY)
{
    int iDataCntr;
    stpm64 m64pCoeff = (stpm64) &fpCoeff[1];
    stpm64 m64pCoeff2 = (stpm64) &fpCoeff[3];
    stpm64 m64pX = (stpm64) fpX;
    stpm64 m64pY = (stpm64) fpY;

    /* Without any input mm6 would never be loaded, and the state store at
       the end would write its stale contents into fpX[2]. */
    if (iDataLength <= 0)
    {
        return;
    }

    /* Register setup, kept across the asm statements below:
       mm2 = (coeff[2], coeff[1])   swapped to line up with (X[0], X[1])
       mm3 = (coeff[0], 0)
       mm4 = (coeff[4], coeff[3])   swapped to line up with (Y[0], Y[1])
       mm5 = (X[0], X[1])
       mm7 = (Y[0], Y[1]) */
    X86_ASM (
        "movq %0, %%mm0\n\t" \
        "pswapd %%mm0, %%mm2\n\t" \
        "movd %1, %%mm3\n\t" \
        "movq %2, %%mm0\n\t" \
        "pswapd %%mm0, %%mm4\n\t" \
        "movq %3, %%mm5\n\t" \
        "movq %4, %%mm7\n\t"
        :
        : "m" (*m64pCoeff),
          "m" (fpCoeff[0]),
          "m" (*m64pCoeff2),
          "m" (*m64pX),
          "m" (*m64pY)
        : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
    for (iDataCntr = 0; 
        iDataCntr < iDataLength; 
        iDataCntr++)
    {
        /* The three pfmul/pfacc groups fold the X, input and Y products
           into mm0; the final pfacc leaves the complete sum in both halves.
           The two pswapd/punpckldq groups then shift the new output into
           mm7 and the new input (mm6) into mm5. */
        X86_ASM (
            "pxor %%mm0, %%mm0\n\t" \
            "movd %1, %%mm6\n\t" \
            "movq %%mm5, %%mm1\n\t" \
            "pfmul %%mm2, %%mm1\n\t" \
            "pfacc %%mm1, %%mm0\n\t" \
            "movq %%mm6, %%mm1\n\t" \
            "pfmul %%mm3, %%mm1\n\t" \
            "pfacc %%mm1, %%mm0\n\t" \
            "movq %%mm7, %%mm1\n\t" \
            "pfmul %%mm4, %%mm1\n\t" \
            "pfacc %%mm1, %%mm0\n\t" \
            "pfacc %%mm0, %%mm0\n\t" \
            \
            "pswapd %%mm7, %%mm1\n\t" \
            "movq %%mm1, %%mm7\n\t" \
            "punpckldq %%mm0, %%mm7\n\t" \
            \
            "pswapd %%mm5, %%mm1\n\t" \
            "movq %%mm1, %%mm5\n\t" \
            "movq %%mm6, %%mm1\n\t" \
            "punpckldq %%mm1, %%mm5\n\t" \
            \
            "movd %%mm0, %0\n\t"
            : "=m" (fpVect[iDataCntr])
            : "0" (fpVect[iDataCntr])
            : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
    }
    /* Write the filter state back for the next call. */
    X86_ASM (
        "movq %%mm5, %0\n\t" \
        "movd %%mm6, %1\n\t" \
        "movq %%mm7, %2\n\t"
        : "=m" (*m64pX),
          "=m" (fpX[2]),
          "=m" (*m64pY)
        :
        : "mm5", "mm6", "mm7", "memory");
    X86_ASM ("femms\n\t");
}


/*
    Second order IIR filter with separate source and destination (3DNow!).

    fpDest       receives iDataLength output samples
    fpSrc        iDataLength input samples
    iDataLength  number of samples
    fpCoeff      five coefficients, see the formula below
    fpX          input history; [0] and [1] are read and updated, [2]
                 receives the last input sample processed
    fpY          output history; [0] and [1] are read and updated

    For every input sample x the output is

        y = fpCoeff[0] * x
          + fpCoeff[1] * fpX[1] + fpCoeff[2] * fpX[0]
          + fpCoeff[3] * fpY[1] + fpCoeff[4] * fpY[0]

    after which the histories are shifted: fpX becomes (fpX[1], x) and fpY
    becomes (fpY[1], y).  The feedback terms are added, not subtracted.

    Nothing is read or written when iDataLength <= 0.
*/
void dsp_x86_3dnow_iirf_nip (float *fpDest, const float *fpSrc, int iDataLength, 
    const float *fpCoeff, float *fpX, float *fpY)
{
    int iDataCntr;
    stpm64 m64pCoeff = (stpm64) &fpCoeff[1];
    stpm64 m64pCoeff2 = (stpm64) &fpCoeff[3];
    stpm64 m64pX = (stpm64) fpX;
    stpm64 m64pY = (stpm64) fpY;

    /* Without any input mm6 would never be loaded, and the state store at
       the end would write its stale contents into fpX[2]. */
    if (iDataLength <= 0)
    {
        return;
    }

    /* Register setup, kept across the asm statements below:
       mm2 = (coeff[2], coeff[1])   swapped to line up with (X[0], X[1])
       mm3 = (coeff[0], 0)
       mm4 = (coeff[4], coeff[3])   swapped to line up with (Y[0], Y[1])
       mm5 = (X[0], X[1])
       mm7 = (Y[0], Y[1]) */
    X86_ASM (
        "movq %0, %%mm0\n\t" \
        "pswapd %%mm0, %%mm2\n\t" \
        "movd %1, %%mm3\n\t" \
        "movq %2, %%mm0\n\t" \
        "pswapd %%mm0, %%mm4\n\t" \
        "movq %3, %%mm5\n\t" \
        "movq %4, %%mm7\n\t"
        :
        : "m" (*m64pCoeff),
          "m" (fpCoeff[0]),
          "m" (*m64pCoeff2),
          "m" (*m64pX),
          "m" (*m64pY)
        : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
    for (iDataCntr = 0; 
        iDataCntr < iDataLength; 
        iDataCntr++)
    {
        /* The three pfmul/pfacc groups fold the X, input and Y products
           into mm0; the final pfacc leaves the complete sum in both halves.
           The two pswapd/punpckldq groups then shift the new output into
           mm7 and the new input (mm6) into mm5. */
        X86_ASM (
            "pxor %%mm0, %%mm0\n\t" \
            "movd %1, %%mm6\n\t" \
            "movq %%mm5, %%mm1\n\t" \
            "pfmul %%mm2, %%mm1\n\t" \
            "pfacc %%mm1, %%mm0\n\t" \
            "movq %%mm6, %%mm1\n\t" \
            "pfmul %%mm3, %%mm1\n\t" \
            "pfacc %%mm1, %%mm0\n\t" \
            "movq %%mm7, %%mm1\n\t" \
            "pfmul %%mm4, %%mm1\n\t" \
            "pfacc %%mm1, %%mm0\n\t" \
            "pfacc %%mm0, %%mm0\n\t" \
            \
            "pswapd %%mm7, %%mm1\n\t" \
            "movq %%mm1, %%mm7\n\t" \
            "punpckldq %%mm0, %%mm7\n\t" \
            \
            "pswapd %%mm5, %%mm1\n\t" \
            "movq %%mm1, %%mm5\n\t" \
            "movq %%mm6, %%mm1\n\t" \
            "punpckldq %%mm1, %%mm5\n\t" \
            \
            "movd %%mm0, %0\n\t"
            : "=m" (fpDest[iDataCntr])
            : "m" (fpSrc[iDataCntr])
            : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
    }
    /* Write the filter state back for the next call. */
    X86_ASM (
        "movq %%mm5, %0\n\t" \
        "movd %%mm6, %1\n\t" \
        "movq %%mm7, %2\n\t"
        : "=m" (*m64pX),
          "=m" (fpX[2]),
          "=m" (*m64pY)
        :
        : "mm5", "mm6", "mm7", "memory");
    X86_ASM ("femms\n\t");
}


#ifdef __cplusplus
}
#endif

#endif
