/*
 * Copyright (C) 1995	Edward Der-Hua Liu, Hsin-Chu, Taiwan
 *  text table -> binary table
 * example: tsa2d tsin.src
 */

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <string.h>

/*
 * One two-byte field plus one 16-bit field.
 * NOTE(review): presumably a (character, phonetic key) pair, but the type
 * is not referenced anywhere in the visible code -- confirm before relying
 * on it.
 */
typedef struct {
    u_char          ch[2];
    u_short         ph;
}               ITEM;

/*
 * NOTE(review): not referenced in the visible code; main() builds the hash
 * table it writes out in its own local hashidx[256].
 */
int             hash[256];

/*
 * Symbol tables shared by lookup() and prph().  Each table is read as a
 * sequence of two-byte entries, so entry n starts at byte offset 2 * n.
 * lookup() searches tables 0..2; prph() reads one entry from each of the
 * four tables.
 *
 * NOTE(review): prph() can address byte offsets up to 63 in pk[0], 7 in
 * pk[1], 31 in pk[2] and 15 in pk[3], yet the literals below are much
 * shorter.  They look as if their non-ASCII bytes were lost in an encoding
 * conversion -- verify against a pristine copy of the file.
 */
static char    *pk[] = {
    "  tuvwxyz{|}~",
    "  ",
    "  ",
    "  "
};

/*
 * Working state filled in by main():
 *   phidx     - offset into bf of every record read (sorted later by qcmp)
 *   sidx      - offset into sf of every record kept after de-duplication
 *   phcount   - number of entries in phidx, later the number kept in sidx
 *   bfsize    - bytes allocated for bf (sf is allocated with the same size)
 *   phidxsize - elements allocated for phidx (and for sidx)
 *   bf        - records in input order
 *   sf        - records in sorted order with duplicates removed
 */
int            *phidx, *sidx, phcount;
int             bfsize, phidxsize;
u_char         *bf;
u_char         *sf;

/*
 * qsort() comparison function for two record offsets.
 *
 * a, b: each points to an int holding the offset of a record inside the
 *       global buffer bf.  A record is laid out as
 *           [1 byte n][n keys, 2 bytes each][n characters, 2 bytes each]
 *
 * Order: the keys are compared pairwise as native u_short values over the
 * shorter of the two records; if those are all equal the record with fewer
 * keys sorts first; records with identical keys are ordered by memcmp() of
 * their character bytes.
 *
 * Returns a negative, zero or positive value, as qsort() requires.
 *
 * The parameters are const void * so that the function has exactly the type
 * qsort() calls through; calling a comparator declared with int * parameters
 * through qsort() is undefined behaviour.  Callers that pass int * directly
 * still work because the conversion to const void * is implicit.
 */
int qcmp(const void *a, const void *b)
{
    int             idxa = *(const int *) a;
    int             idxb = *(const int *) b;
    int             lena, lenb, len, cha, chb;
    int             i;
    u_short         ka, kb;

    /* first byte of a record is its key/character count */
    lena = bf[idxa++];
    lenb = bf[idxb++];
    /* the character bytes follow the n two-byte keys */
    cha = idxa + lena * 2;
    chb = idxb + lenb * 2;
    len = (lena < lenb ? lena : lenb);
    for (i = 0; i < len; i++) {
	/* memcpy: the keys are not guaranteed to be aligned inside bf */
	memcpy(&ka, &bf[idxa], 2);
	memcpy(&kb, &bf[idxb], 2);
	if (ka > kb)
	    return 1;
	if (kb > ka)
	    return -1;
	idxa += 2;
	idxb += 2;
    }
    if (lena > lenb)
	return 1;
    if (lena < lenb)
	return -1;
    /* same keys, same length: fall back to the characters themselves */
    return memcmp(&bf[cha], &bf[chb], lena * 2);
}

/*
 * Left shift that lookup() applies to an entry index found in pk[i].
 * prph() unpacks the same positions: bits 9-13 -> pk[0], bits 7-8 -> pk[1],
 * bits 3-6 -> pk[2], bits 0-2 -> pk[3].
 */
static int      shiftb[] = {9, 7, 3, 0};

/*
 * Translate the symbol at s into its contribution to a phonetic key.
 *
 * s: points at the first byte of the symbol; a second byte is read as well
 *    when the first byte is 128 or above.
 *
 * A first byte below 128 yields that byte minus '0'.  Otherwise the two
 * bytes s[0], s[1] are searched for in pk[0], pk[1] and pk[2], in that
 * order; the first table containing them gives
 *     (byte offset of the match / 2) << shiftb[table].
 * Returns 0 when none of the three tables contains the pair.
 */
int lookup(u_char * s)
{
    char            pair[3];
    int             table;

    if (s[0] < 128)
	return s[0] - '0';

    pair[0] = s[0];
    pair[1] = s[1];
    pair[2] = 0;

    for (table = 0; table < 3; table++) {
	char           *hit = strstr(pk[table], pair);

	if (hit != NULL)
	    return (((hit - pk[table]) >> 1) << shiftb[table]);
    }
    return 0;
}

/*
 * Print the four two-byte symbols packed into key kk on stdout.
 *
 * The key is split, from the low end, into a 3-bit, a 4-bit, a 2-bit and a
 * 5-bit field.  Each field selects one two-byte entry: the 5-bit field from
 * pk[0], the 2-bit field from pk[1], the 4-bit field from pk[2] and the
 * 3-bit field from pk[3].  The entries are written in table order with no
 * separator and no newline.
 */
void 
prph(u_short kk)
{
    /* byte offsets into the tables: field value times two */
    u_int           off0 = ((kk >> 9) & 31) << 1;
    u_int           off1 = ((kk >> 7) & 3) << 1;
    u_int           off2 = ((kk >> 3) & 15) << 1;
    u_int           off3 = (kk & 7) << 1;

    printf("%c%c%c%c%c%c%c%c",
	   pk[0][off0], pk[0][off0 + 1],
	   pk[1][off1], pk[1][off1 + 1],
	   pk[2][off2], pk[2][off2 + 1],
	   pk[3][off3], pk[3][off3 + 1]);
}


/*
 * Convert a text phrase table into the binary files "tsin" and "tsin.idx".
 *
 * Input is read from argv[1] when given, otherwise from stdin.  On every
 * non-empty line the bytes up to the first space are taken as two-byte
 * characters; the rest is split at spaces into groups, and each group is
 * folded into one u_short key by OR-ing lookup() over its symbols.  The
 * number of groups becomes the record length n and the record
 *     [1 byte n][n keys, 2 bytes each][n characters, 2 bytes each]
 * is appended to bf, with its offset stored in phidx.  Lines holding more
 * than 80 characters or groups are reported on stderr and skipped.
 *
 * The records are then sorted with qcmp(), records identical to their
 * predecessor are dropped, and the result is written out:
 *     tsin      the remaining records, back to back
 *     tsin.idx  phrase count (int), hashidx[256] (int each), then the
 *               offset of every record inside tsin (int each)
 * hashidx[k] is the index of the first record whose first key, shifted
 * right by 6, equals k; an empty slot takes the value of the next slot and
 * hashidx[255] is always the phrase count.
 *
 * Exits with status 0 on success.  On failure to open or write a file, or
 * to allocate memory, a message is printed and the status is non-zero.
 */
int main(int argc, char **argv)
{
    enum { MAXPHRASE = 80 };	/* capacity of chbuf and phbuf */
    FILE           *fp, *fw;
    u_char          s[1024];
    u_char          chbuf[MAXPHRASE][2];
    u_short         phbuf[MAXPHRASE];
    int             i, j, idx, len, ofs;
    int             nch, reclen;
    u_short         kk;
    int             hashidx[256];
    u_char          clen;

    if (argc > 1) {
	if ((fp = fopen(argv[1], "r")) == NULL) {
	    printf("Cannot open %s\n", argv[1]);
	    exit(-1);
	}
    } else
	fp = stdin;

    bfsize = 300000;
    if (!(bf = malloc(bfsize))) {
	puts("malloc err");
	exit(1);
    }
    phidxsize = 18000;
    if (!(phidx = malloc(phidxsize * sizeof(*phidx)))) {
	puts("malloc err");
	exit(1);
    }

    /*
     * A line with fewer characters than groups copies whatever chbuf
     * still holds; start from zeros so that is never uninitialised memory.
     */
    memset(chbuf, 0, sizeof(chbuf));
    phcount = ofs = 0;

    /*
     * Loop on the result of fgets(), not on feof(): feof() only turns true
     * after a read has failed, so testing it first processes the last line
     * twice and parses an uninitialised buffer when the input is empty.
     */
    while (fgets((char *) s, sizeof(s), fp) != NULL) {
	len = strlen((char *) s);
	if (len > 0 && s[len - 1] == '\n')
	    s[--len] = 0;
	if (len == 0)
	    continue;

	/* characters: two bytes each, up to the first space */
	nch = i = 0;
	while (i < len && s[i] != ' ') {
	    if (nch < MAXPHRASE)
		memcpy(chbuf[nch], &s[i], 2);
	    nch++;
	    i += 2;
	}
	i++;

	/* one key per space-separated group */
	j = 0;
	while (i < len) {
	    kk = 0;
	    while (i < len && s[i] != ' ') {
		kk |= lookup(&s[i]);
		/* bytes with the high bit set are two-byte symbols */
		if (s[i] & 128)
		    i += 2;
		else
		    i++;
	    }
	    i++;
	    if (j < MAXPHRASE)
		phbuf[j] = kk;
	    j++;
	}
	if (nch > MAXPHRASE || j > MAXPHRASE) {
	    fprintf(stderr, "phrase too long, skipped: %s\n", (char *) s);
	    continue;
	}
	clen = j;
	reclen = 1 + 4 * j;

	/* make room before writing; one step always covers one record */
	if (ofs + reclen > bfsize) {
	    bfsize += 65536;
	    if (!(bf = realloc(bf, bfsize))) {
		puts("realloc err");
		exit(1);
	    }
	}
	if (phcount >= phidxsize) {
	    phidxsize += 1000;
	    if (!(phidx = realloc(phidx, phidxsize * sizeof(*phidx)))) {
		puts("realloc err");
		exit(1);
	    }
	}

	phidx[phcount++] = ofs;
	bf[ofs++] = clen;
	memcpy(&bf[ofs], phbuf, clen * 2);
	ofs += clen * 2;
	memcpy(&bf[ofs], chbuf, clen * 2);
	ofs += clen * 2;
    }
    fclose(fp);

    puts("Sorting ....");
    /* the cast is a no-op when qcmp already has the qsort() signature */
    qsort(phidx, phcount, sizeof(*phidx),
	  (int (*) (const void *, const void *)) qcmp);

    if (!(sf = malloc(bfsize))) {
	puts("malloc err");
	exit(1);
    }
    if (!(sidx = malloc(phidxsize * sizeof(*sidx)))) {
	puts("malloc err");
	exit(1);
    }

    /*
     * Copy the records to sf in sorted order, skipping any record equal to
     * the one copied before it.  s holds the previous record and starts out
     * zeroed, so records with a zero count are skipped as well.
     */
    ofs = 0;
    j = 0;
    memset(s, 0, sizeof(s));
    for (i = 0; i < phcount; i++) {
	idx = phidx[i];
	/* int, not u_char: 4 * n + 1 exceeds 255 once n reaches 64 */
	reclen = 4 * bf[idx] + 1;
	if (memcmp(s, &bf[idx], reclen) == 0)
	    continue;
	sidx[j] = ofs;
	memcpy(&sf[ofs], &bf[idx], reclen);
	memcpy(s, &bf[idx], reclen);
	j++;
	ofs += reclen;
    }
    phcount = j;

    for (i = 0; i < 256; i++)
	hashidx[i] = -1;

    /* remember the first record of every (first key >> 6) bucket */
    for (i = 0; i < phcount; i++) {
	u_short         bucket;

	memcpy(&bucket, &sf[sidx[i] + 1], 2);
	bucket >>= 6;
	/* keys built from unexpected input may not fit the table */
	if (bucket < 256 && hashidx[bucket] < 0)
	    hashidx[bucket] = i;
    }

    /* the last slot is the end marker; empty slots take the next slot */
    if (hashidx[0] == -1)
	hashidx[0] = 0;
    hashidx[255] = phcount;
    for (i = 254; i >= 0; i--)
	if (hashidx[i] == -1)
	    hashidx[i] = hashidx[i + 1];

    puts("Writing data");
    if ((fw = fopen("tsin", "wb")) == NULL) {
	puts("create err");
	exit(-1);
    }
    if (fwrite(sf, 1, ofs, fw) != (size_t) ofs || fclose(fw) != 0) {
	puts("write err");
	exit(-1);
    }

    if ((fw = fopen("tsin.idx", "wb")) == NULL) {
	puts("create err");
	exit(-1);
    }
    if (fwrite(&phcount, sizeof(phcount), 1, fw) != 1
	|| fwrite(hashidx, 1, sizeof(hashidx), fw) != sizeof(hashidx)
	|| fwrite(sidx, sizeof(*sidx), phcount, fw) != (size_t) phcount) {
	puts("write err");
	exit(-1);
    }
    printf("%d phrases\n", phcount);

    if (fclose(fw) != 0) {
	puts("write err");
	exit(-1);
    }
    exit(0);
}
