#include "rend-x.h"
//#define DEBUG 12

/*
 * Pixel values used for the edges of the rendered image.  How a value
 * shows up on screen depends on the r/g/b masks of the visual in use.
 */
#ifndef DEBUG
#define LEFT_EDGE24   0x00ff00
#define RIGHT_EDGE24  0x00ff00
#define BOTTOM_EDGE24 0x0000ff
#else
/* Colors to use for the edges of the image when DEBUG is defined :
   handy to set these to something like red and green
   for debugging : have to know r/g/b masks though.
   BOTTOM_EDGE24 has to be defined in this branch as well, because
   render32() uses it unconditionally. */
#define LEFT_EDGE24   0x000000
#define RIGHT_EDGE24  0x000000
#define BOTTOM_EDGE24 0x000000
#endif

/* File-local renderers, one per supported pixel depth, plus the
   depth-independent fallback.  render32() has external linkage and is
   defined before its first use, so it needs no declaration here. */
static void generic_render(Gfx *g);
static void render24(Gfx *g);
static void render16(Gfx *g);
static void render8(Gfx *g);

/*
 * the 8 32-bit registers `%eax' (the accumulator), `%ebx', `%ecx',
 `%edx', `%edi', `%esi', `%ebp' (the frame pointer), and `%esp'
 (the stack pointer).
 
 * the 8 16-bit low-ends of these: `%ax', `%bx', `%cx', `%dx', `%di',
 `%si', `%bp', and `%sp'.
 
 * the 8 8-bit registers: `%ah', `%al', `%bh', `%bl', `%ch', `%cl',
 `%dh', and `%dl' (These are the high-bytes and low-bytes of `%ax',
 `%bx', `%cx', and `%dx')
*/

// This is all about the "cmap" variable, and how if one were to
// write this entire function in assembler then it could be made
// a bit quicker - possibly..
// See Old_Glurbules_Shite/Blobs2.devel.stuff/Old/Junk/rend.s for an
// earlier attempt of mine at this; I don't think it works, though!
  // cmap isn't in a register, but I think it could be reached through
  // the stack or frame pointer - for instance if the stack were
  // prefilled with the 256 values from the cmap, with any other
  // variable or two placed 256 entries further down.
  // Otherwise the address of cmap has to be fetched from the stack
  // before every lookup, i.e. cmap is stored at -4(ebp),
  // so -4(ebp) is moved into some register, say edx as it's
  // the only register not in use, and then (edx,eax,4) is looked up,
  // stored back in a register and then stored into the dest
  // image, which is currently edi.
  // i.e.
  //        movl -4(%ebp),%edx     // GCC likes to use ebp for stacks ?!
  //        movl (%edx,%eax,4),%eax
  //        movl %eax,(%edi)
  //        addl $4,%edi
  // if the stack pointer was in the right place this could be
  //     movl (%ebp, %eax, 4), %eax
  //       // Not sure if that'll work because it might
  //       // need to be -%eax if that's possible (or 
  //       // ebp just needs to be inited the other way round
  //       // and data poured in backwards and ebp set 256 off)
  //     movl %eax, (%edi)
  //     addl $4, %edi
  // could the last two be replaced by a stosb/stosl or something ??
  // 
  // later this means that when we need a variable off the stack,
  // say what would have been at -8(ebp) ...
  // movl -8(%ebp), %edx
  // could become
  // movl (%ebp, 260, 4), %edx
  // instead.. 
  // or perhaps the colormap should go on the stack before
  // any variables ?? dunno really.
  // check the output of this with :
  //
  // cc -O -mfancy-math-387 -S rend-x.c && more rend-x.s



/*
 * render32 - blur the intensity buffer in place and draw the result into
 * an image with 32-bit pixels.
 *
 * For every row except the last, each interior cell (columns
 * 1 .. width-2) of g->cbuffer is replaced by the average of itself and
 * the three cells below it (below-left, below, below-right).  The new
 * value is looked up in g->simple_map and stored at the same position in
 * g->xim->data.  The last image row is filled with BOTTOM_EDGE24 and the
 * last buffer row is halved.  The first and last column of the image are
 * not written.
 *
 * Plain C on purpose: no variable is bound to a named machine register,
 * so this builds on any target and register allocation is left to the
 * compiler.
 *
 * NOTE(review): assumes image rows are exactly g->xres pixels wide (no
 * row padding) and that cbuffer holds at least xres * xim->height bytes
 * - confirm against the code that creates them.
 */
void render32(Gfx *g) {
#define IMGDATATYPE unsigned int
  const unsigned long *cmap = g->simple_map;
  IMGDATATYPE   *out    = (IMGDATATYPE *)g->xim->data; /* current image row  */
  unsigned char *in     = g->cbuffer;                  /* current buffer row */
  const int      width  = g->xres;
  const int      height = g->xim->height;
  int x, y;

  /* Nothing to draw.  Counting loops are used below instead of
     "do { } while (--n)" so that tiny dimensions cannot wrap a counter
     around and run off the end of the buffers. */
  if (width <= 0 || height <= 0) {
    return;
  }

  for (y = 0; y < height - 1; y++) {
    for (x = 1; x < width - 1; x++) {
      unsigned int avg = in[x];
      avg += in[x + width - 1];
      avg += in[x + width];
      avg += in[x + width + 1];
      avg >>= 2;                      /* four samples -> still 0..255 */
      in[x]  = (unsigned char)avg;
      out[x] = (IMGDATATYPE)cmap[avg];
    }
    in  += width;
    out += width;
  }

  /* in/out now address the last row: solid edge colour in the image,
     and the buffer row is halved. */
  for (x = 0; x < width; x++) {
    out[x] = BOTTOM_EDGE24;
    in[x] >>= 1;
  }
}


#if 0
/*
 * six_var_render32 - experimental variant of the 32-bit renderer that
 * packs both loop counters into the single int `t`: the low 16 bits
 * count the pixels left in the current row, the high 16 bits count the
 * rows left.  It sits inside an "#if 0" region, so it is not compiled.
 *
 * Each processed cell of g->cbuffer becomes the average of itself and
 * the three cells one row below; the result is looked up in
 * g->simple_map and stored in g->xim->data.  Finally `width` pixels are
 * set to BOTTOM_EDGE24 and the matching buffer cells are halved.
 */
static void six_var_render32(Gfx *g) {
#define IMGDATATYPE unsigned int
  unsigned long  *cmap = g->simple_map;
  IMGDATATYPE    *line = (IMGDATATYPE *)g->xim->data; 
  unsigned char *ibuff = g->cbuffer;
  register int       y = g->xim->height - 1;
  register int   width = g->xres;
  // low half: pixels per row (width-2); high half: number of rows (y)
  register int       t = ((width-2)&0xffff) | (0xffff0000 & (y<<16));

  ibuff--; // both pointers start one element early, so that the "+2" at
  line--;  // the top of each row lands them on column 1 of that row

  while (t) {
    ibuff+=2;        // skip the last column of one row and the first of the next
    line+=2;
    t-=0x10000;      // one row fewer to go (high half)
    t&=0xffff0000;
    t|=(width-2);    // reload the per-row pixel count (low half)
    while (t&0xffff) {
      register int     eax;
      t--;
      eax  = *ibuff;
      eax += *(ibuff + width);
      eax += *(ibuff - 1 + width);
      eax += *(ibuff + 1 + width);
      eax >>= 2;     // average of the four samples
      *ibuff++ = eax;
      *line++  = cmap[eax];
    }
  }

  // step onto the start of the following row, then fill `width` pixels
  line++;
  ibuff++;
  t=width;
  while (t--) {
    *line++ = BOTTOM_EDGE24;
    *ibuff++>>=1;
  }

}


/*
 * s_render32 - experimental variant of the 32-bit renderer with separate
 * row (y) and column (x) counters, plus a running total `t` that is only
 * decremented and printed.  It sits inside an "#if 0" region, so it is
 * not compiled.
 *
 * Each processed cell of g->cbuffer becomes the average of itself and
 * the three cells one row below; the result is looked up in
 * g->simple_map and stored in g->xim->data.  Finally `width` pixels are
 * set to BOTTOM_EDGE24 and the matching buffer cells are halved.
 */
static void s_render32(Gfx *g) {
  // Still needs the following variables:
  // cmap, line, ibuff, y, width
  // inside the loops it also needs
  // eax, x
  // 7 variables.
  // too many!
  // should be able to get this down to 6 at least,
  // as x and width are very closely related.

#define IMGDATATYPE unsigned int
  unsigned long  *cmap = g->simple_map;
  IMGDATATYPE    *line = (IMGDATATYPE *)g->xim->data; 
  unsigned char *ibuff = g->cbuffer;
  register int       y = g->xim->height - 1;
  register int   width = g->xres;
  register int       x = width - 2;
  register int       t = y * (width -2);  // total number of cells the loops will process
  ibuff--; // both pointers start one element early, so that the "+2" at
  line--;  // the top of each row lands them on column 1 of that row

  while(y--) {
    x = width-2;
    ibuff+=2;   // skip the last column of one row and the first of the next
    line+=2;
    printf("t is %d\n",t);
    while (x--) {
      register int eax;
      t--;
      eax  =  *ibuff;
      eax += *(ibuff + width);
      eax += *(ibuff - 1 + width);
      eax += *(ibuff + 1 + width);
      eax >>= 2;   // average of the four samples
      *ibuff++ = eax;
      *line++  = cmap[eax];
    }
  }

  printf("t ends on is %d\n",t);
  // step onto the start of the following row, then fill `width` pixels
  line++;
  ibuff++;
  x=width;
  while (x--) {
    *line++ = BOTTOM_EDGE24;
    *ibuff++>>=1;
  }

}
#endif

/*
 * generic_render - depth-independent (and slow) renderer.
 *
 * For every row except the last, each interior cell (columns
 * 1 .. xres-2) of g->cbuffer is replaced by the average of its left
 * neighbour, itself, its right neighbour and the cell directly below.
 * The new value is looked up in g->simple_map and drawn with XPutPixel()
 * at the same (x, y).  The blur runs in place, so the left neighbour has
 * already been updated in the current pass.
 *
 * The last buffer row is not drawn; each of its cells in columns
 * 1 .. xres-1 is replaced by the cell that lies `value` bytes earlier in
 * the buffer.
 *
 * Buffer cell (x, y) and image pixel (x, y) are kept aligned: the row
 * pointer is indexed with x itself, so column 0 is the left neighbour of
 * the first processed cell and nothing before g->cbuffer is ever read.
 */
static void generic_render(Gfx *g) {
  /* This is 'quite' slow really!! */
  const int width  = g->xres;
  const int height = g->yres;
  unsigned char *row = g->cbuffer;   /* start of the row being processed */
  int x, y;
#ifdef DEBUG
  printf("Using default unaccelerated (poor performance) render routine\n");
#endif
  if (width <= 0 || height <= 0) {
    return;   /* empty buffer: nothing to blur or draw */
  }

  for (y = 0; y < height - 1; y++) {
    for (x = 1; x < width - 1; x++) {
      /* This is a poor fire effect really: it reads cells of this frame
         that it has already overwritten (the left neighbour). */
      const int acc = (row[x - 1] + row[x] + row[x + 1] + row[x + width]) >> 2;
      row[x] = (unsigned char)acc;
      XPutPixel(g->xim, x, y, g->simple_map[acc]);
    }
    row += width;
  }

  /* row now addresses the last line of the buffer.  The read and the
     write are separate statements so their order is well defined, and
     the backward read is skipped if it would fall before the buffer. */
  for (x = 1; x < width; x++) {
    unsigned char *cell = row + x;
    const int back = *cell;
    if (cell - g->cbuffer >= back) {
      *cell = cell[-back];
    }
  }
}


/*
 * render24 - renderer selected for 24 bits-per-pixel images.
 * There is no dedicated fast path for this depth, so the frame is handed
 * to generic_render(); DEBUG builds print a reminder first.
 */
static void render24(Gfx *g)
{
#if defined(DEBUG)
  printf("FIXME: accelerated packed 24 bits per pixel mode is not supported\n");
#endif

  generic_render(g);
}

/*
 * render16 - renderer selected for 16 bits-per-pixel images.
 * There is no dedicated fast path for this depth, so the frame is handed
 * to generic_render(); DEBUG builds print a reminder first.
 */
static void render16(Gfx *g)
{
#if defined(DEBUG)
  printf("FIXME: accelerated 16 bits per pixel mode is not supported\n");
#endif

  generic_render(g);
}

/*
 * render8 - renderer selected for 8 bits-per-pixel images.
 * There is no dedicated fast path for this depth, so the frame is handed
 * to generic_render(); DEBUG builds print a reminder first.
 */
static void render8(Gfx *g)
{
#if defined(DEBUG)
  printf("FIXME: accelerated 8 bits per pixel mode is not supported\n");
#endif

  generic_render(g);
}

/*
 * render_blurred_buffer - draw one frame, picking the renderer that
 * matches the bits-per-pixel of g->xim.  Depths without a renderer of
 * their own go through generic_render().
 */
void render_blurred_buffer(Gfx *g)
{
  const int depth = g->xim->bits_per_pixel;

  if (depth == 32) {
    render32(g);
  } else if (depth == 24) {
    render24(g);
  } else if (depth == 16) {
    render16(g);
  } else if (depth == 8) {
    render8(g);
  } else {
    generic_render(g);
  }
}
