#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <xl_pinyin.h>
#include "safestring.h"

/* One Hanzi phrase stored under a pinyin key; nodes form a singly linked list. */
typedef struct _HzPhrase
{
   /* phrase bytes (two per character, see SavePhraseToMem) plus a terminating NUL */
   u_char hz[MAX_PHRASE_LEN*2+1];
   /* frequency value supplied when the phrase was stored */
   u_char freq;
   /* next phrase stored under the same key, NULL at the end of the list */
   struct _HzPhrase *next;
}HzPhrase;

/* All phrases that share one packed pinyin key; nodes are chained per phtab slot. */
typedef struct _KeyPhrase
{
   /* phrase length as passed to SavePhraseToMem (number of pinyin keys packed in key) */
   u_short len;
   /* packed pinyin key; KEYLEN(len) bytes of it are compared and written out */
   u_char key[2*MAX_PHRASE_LEN+1];
   int  count;   // number of HzPhrase items in hzph; SavePhraseToFile writes it as a u_short
   /* head of the list of Hanzi phrases stored under this key */
   HzPhrase *hzph;
   /* next KeyPhrase in the same phtab slot, NULL at the end of the chain */
   struct _KeyPhrase *next;
}KeyPhrase, *PKeyPhrase;

/* Pinyin table filled by LoadTable: first index is the pinyin's first letter
   minus 'a', second index is the position within that letter's group. */
PinYin pytab[26][MAX_EACH_PY];
/* Chains of KeyPhrase nodes, indexed by the value KeyAhead() extracts from a key. */
PKeyPhrase phtab[MAX_PY_NUM];
/* Number of KeyPhrase nodes created in the matching phtab slot. */
u_short phcount[MAX_PY_NUM];

// Extract the leading 10-bit pinyin code from a packed phrase key:
// all eight bits of key[0] followed by the top two bits of key[1].
u_short KeyAhead(u_char *key)
{
	const unsigned upper = ((unsigned)key[0] << 2) & 0x3ff;
	const unsigned lower = (unsigned)key[1] >> 6;

	return (u_short)(upper | lower);
}

// Unpack `len` consecutive 10-bit pinyin codes from the byte string `key`
// into pykey[0..len-1].  Code number n starts at bit 10*n of the byte stream
// (most significant bit first), so it always straddles two adjacent bytes.
void Key2PYKey(u_char *key, u_short *pykey, u_char len)
{
        int idx;

        for (idx = 0; idx < len; idx++) {
                const int bitpos = idx * 10;
                const int byte   = bitpos / 8;   /* byte holding the code's first bit */
                const int shift  = bitpos % 8;   /* 0, 2, 4 or 6 */
                const unsigned upper = ((unsigned)key[byte] << (2 + shift)) & 0x3ff;
                const unsigned lower = (unsigned)key[byte + 1] >> (6 - shift);

                pykey[idx] = (u_short)(upper | lower);
        }
}


// Pack `len` 10-bit pinyin codes from pykey[] into the byte string `key`,
// most significant bit first.  The first (len*10)/8 + 1 bytes of `key` are
// cleared before packing, so the caller's buffer must be at least that big.
void PYKey2Key(u_char *key, u_short *pykey, u_char len)
{
        const int nbytes = (len * 10) / 8 + 1;
        int idx;

        memset(key, 0, (size_t)nbytes);

        for (idx = 0; idx < len; idx++) {
                const int bitpos = idx * 10;
                const int byte   = bitpos / 8;   /* byte receiving the code's first bit */
                const int shift  = bitpos % 8;   /* 0, 2, 4 or 6 */

                /* high part of the code fills what is left of the current byte */
                key[byte]     |= (pykey[idx] & 0x3ff) >> (2 + shift);
                /* low part goes to the top of the next byte; storing into a
                   u_char drops everything above bit 7 */
                key[byte + 1] |= pykey[idx] << (6 - shift);
        }
}


int LoadTable(char* pathname)
{
  FILE *stream;
  char str[1024], *strpy, *strhz;
  int i=1, j=0, lastpy=0, curpy;

  if ( (stream = fopen( pathname, "r" )) == NULL )
  {
    fprintf(stderr,"%s file not found\n",pathname);
    exit(1);
  }

  while ( !feof( stream ))
  {
    if ( fgets(str,1024,stream) != NULL)
    {
      strpy = strtok(str, " \f\n\r\t\v");
      strhz = strtok(NULL, " \f\n\r\t\v");

      curpy = strpy[0]-'a';
      if (curpy != lastpy) j = 0;
      safe_strncpy(pytab[curpy][j].py, strpy, MAX_PY_LEN);
      pytab[curpy][j].key = i;
      lastpy = curpy;
      i++,j++;
    }
  }
  fclose(stream);
  return 0;
}

/*
 * Split strbuf into tokens separated by spaces and tabs.
 *
 * Token number k is copied, NUL-terminated, into strarr[k], whose rows are
 * `len` bytes wide.  A token longer than len-1 bytes is truncated to fit, so
 * a row is never overrun.  The number of rows is not known to this function:
 * the caller must make sure strarr has a row for every token in strbuf.
 *
 * Returns the number of tokens stored (0 for an empty or all-blank string,
 * or when len < 1).
 */
int String2Array(char *strbuf,int len, char strarr[][len])
{
  const char *scan = strbuf;
  int count = 0;

  if (len < 1)
     return 0;   /* a row cannot even hold the terminator */

  while (*scan != '\0')
  {
     const char *start;
     int toklen;

     while (*scan == ' ' || *scan == '\t')
        scan++;   /* skip blanks */
     start = scan;
     while (*scan != '\0' && *scan != ' ' && *scan != '\t')
        scan++;   /* run to the end of the token */

     if (scan > start)
     {
        toklen = (int)(scan - start);
        if (toklen > len - 1)
           toklen = len - 1;   /* keep room for the terminator */
        memcpy(strarr[count], start, (size_t)toklen);
        strarr[count][toklen] = '\0';
        count++;
     }
  }
  return count;
}

int hzlen[10];
int SavePhraseToMem(char *str,u_char *key,u_char len,u_char freq)
{
  PKeyPhrase kph,tmpkph;
  HzPhrase  *hzph;
  int first;
  short ahead;

  if (len<1) return 0;
   /* single char phrase ignored */
  if (len > MAX_PHRASE_LEN) {
      fprintf(stderr, "buffer overrun\n");
      abort();
  }

  ahead = (short)KeyAhead(key);

  kph = phtab[ahead];
  if (kph != NULL)   // first phrase of this pinyin
  {
    first=1; 
    do
    {
      if (first) first = 0; 
      else kph = kph->next;

      /* find the matched pinyin keyphrase */
      if (kph->len == len && !memcmp(kph->key,key,KEYLEN(len)))
      {
        for(hzph = kph->hzph; hzph != NULL; hzph = hzph->next)
           if (len!=1&&!memcmp(hzph->hz,str,2*len))  // same phrase
           {
               fprintf(stderr,"Duplicate phrase %s detected, ignored!\n",
                  hzph->hz);
               return 0; 
           }

        hzph = kph->hzph;
        while(hzph->next != NULL)
          hzph = hzph->next;  // reach the end of the link list

        if ((hzph->next = (HzPhrase *)malloc(sizeof(HzPhrase)))==NULL)
        {
           fprintf(stderr,"no enough memory\n");
           exit(1);
        }
        kph->count++;
        hzph = hzph->next;
        hzph->freq = freq;
        hzph->next = NULL;
        memcpy(hzph->hz,str,len*2);	/* len < MAX_PHRASE_LEN */
        hzph->hz[len*2] = '\0';
        return 1;   // insert a new Hanzi Phrase at the end of the link list
      }
    }while(kph->next != NULL);
 }

 // not found , no matched pinyin keyphrase, allocate a new one
 if ((tmpkph = (KeyPhrase *)malloc(sizeof(KeyPhrase))) == NULL)
 {
     fprintf(stderr,"no enough memory\n");
     exit(1);
 }
 if (phtab[ahead] == NULL) 
    phtab[ahead] = tmpkph;
 else 
    kph->next = tmpkph;
 tmpkph->len = len;
 tmpkph->count = 1;
 memcpy(tmpkph->key,key,KEYLEN(len));	/* len < MAX_PHRASE_LEN */
 tmpkph->next = NULL;

 if ((tmpkph->hzph = (HzPhrase *)malloc(sizeof(HzPhrase))) == NULL)
 {
    fprintf(stderr,"no enough memory\n");
    exit(1);
 }
 tmpkph->hzph->freq = freq;
 tmpkph->hzph->next = NULL;
 memcpy(tmpkph->hzph->hz,str,len*2);	/* len < MAX_PHRASE_LEN */
 tmpkph->hzph->hz[len*2] = '\0';
 phcount[ahead]++;

 hzlen[len]++;
 return 1;
}

int max_count = 0;
int file_size = 0;

int SavePhraseToFile(char *pathname)
{
  KeyPhrase *kph,*kphtmp;
  HzPhrase *hzph,*hzphtmp;
  u_short j;
  int k;
  u_char key[MAX_KEY_LEN],freq;
  u_short len,count,size;
  FILE *out;

  if ((out = fopen( pathname, "wb" )) == NULL)
  {
        fprintf(stderr,"%s cant open.\n",pathname);
        exit(1);
  }

  for (j=1; j<MAX_PY_NUM; j++)
  {
      kph = phtab[j];  
      file_size += 2; //u_short

      fwrite(&(phcount[j]),sizeof(phcount[j]),1,out);

      while (kph != NULL)
      {
        hzph = kph->hzph;
        kphtmp = kph;
        kph = kph->next;

        len = kphtmp->len;
	if (len > MAX_PHRASE_LEN) {
	    fprintf(stderr, "buffer overrun\n");
	    abort();
	}
        memcpy(key,kphtmp->key,KEYLEN(len));
        fwrite(&len,sizeof(len),1, out);
      
        size = kphtmp->count;
        if (kphtmp->count > max_count) max_count = kphtmp->count;

        if (kphtmp->count > 65535)
        {
           fprintf(stdout,"Phrase Count = %d > 255, error!!!\n",kphtmp->count);
           exit(1);
        }  
        fwrite(&size,sizeof(size),1,out);
        fwrite(key,sizeof(u_char),KEYLEN(len),out);

        if (kphtmp->count > max_count) max_count = kphtmp->count;
     /* len, key[len+1], count, phrase, freq , phrase, freq ...*/

        file_size += SizeOfPhrase(len,kphtmp->count);

        while (hzph != NULL)
        {
          hzphtmp = hzph;
          hzph = hzph->next;

          fwrite(hzphtmp->hz,sizeof(char),len*2,out);
          fwrite(&(hzphtmp->freq),sizeof(hzphtmp->freq),1,out);
          free(hzphtmp);
        }
        free(kphtmp);
      }
    }

    fwrite(&file_size,sizeof(file_size),1,out);
    printf("File size=%d\n\n",file_size);
    fclose(out);
    return 1;
}

int LoadPhraseFromFile(char *pathname)
{
  FILE *stream;
  int i,j;
  char str[250];
  u_short len;
  u_char key[MAX_KEY_LEN];
  unsigned short pykey[MAX_PHRASE_LEN];
  int count,ahead,flag = 0,freq;
  char strarr[MAX_PHRASE_LEN+4][2*MAX_PHRASE_LEN+1];

  if ( (stream = fopen( pathname, "r" )) == NULL )
  {
     fprintf(stderr,"%s cant open.\n",pathname);
     exit(1);
  }

  while ( !feof( stream )) 
  {
     if ( fgets(str,250,stream) != NULL) 
     {
	 if (str[0]=='#') {
		 printf("commence:%s",str);
		 continue;
	 }
         str[strlen(str)-1] = '\0';
         count = String2Array(str,2*MAX_PHRASE_LEN+1,strarr);
         len = strlen(strarr[0])/2;
         /* len+1 = count, freq = 0
            len+2 = count, freq = xx */
         if ((len != count-1 && len != count-2) || len > MAX_PHRASE_LEN)
         {
      	     fprintf(stderr,"Phrase %s error!\n",str);
 	     continue;
         }
         
         if (len == count-2)
         {
            freq = atoi(strarr[count-1]);
            if (freq > 255) freq = 255;
            count--;
         }
         else freq = 0;

         for (i=1; i<count; i++)
         {
      	     ahead = (int)strarr[i][0]-'a';
	     flag = 0;
	     if (ahead<0 || ahead>25)
             {
	         fprintf(stderr,"Phrase %s error!!!\n",str);
	         break;
	     }

	     for(j=0; pytab[ahead][j].key; j++)
             {
	        if (!strcmp(pytab[ahead][j].py,strarr[i]))
                {
	           pykey[i-1] = pytab[ahead][j].key;
	           flag = 1;
	           break;
	        }
	    }
	    if (!flag) break;
        } // for

        if (!flag)
        {
	   fprintf(stderr,"Phrase %s !error!!!\n",str);
	   continue;
        }
        PYKey2Key(key, pykey, len);

        /*
        printf("%s, len=%d, key0=%d, key1 =%d, key=%d\n",
         str,len,(int)key[0],(int)key[1],(int)key[2]);
        */

        SavePhraseToMem(str,key,len,freq);
    }
  }
   
  fclose(stream);
  return (0);
}

/*
 * Entry point: <py_map> <input_name_char> <input_name_phrase> <output_name>.
 * Loads the pinyin table, reads both phrase lists into memory and writes the
 * combined table to the output file.  Returns 1 on a usage error, else 0.
 */
int main(int argc,char **argv)
{
  enum { EXPECTED_ARGC = 5 };
  int slot;

  if (argc != EXPECTED_ARGC)
  {
    fprintf(stderr,"usage: %s <py_map> <input_name_char> <input_name_phrase> <output_name>\n",argv[0]);
    return 1;
  }

  /* start from empty tables */
  slot = 0;
  while (slot < MAX_PY_NUM)
  {
    phcount[slot] = 0;
    phtab[slot] = NULL;
    slot++;
  }

  {
    char *py_map       = argv[1];
    char *char_input   = argv[2];
    char *phrase_input = argv[3];
    char *output       = argv[4];

    LoadTable(py_map);
    LoadPhraseFromFile(char_input);
    LoadPhraseFromFile(phrase_input);
    SavePhraseToFile(output);
  }

  return 0;
}

