/* Copyright (C) 2000-2002 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

#include "udm_config.h"

#include <stdlib.h>
#include <fcntl.h>
#include <string.h>
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_IO_H
#include <io.h>
#endif
#include <sys/stat.h>
#include <stdio.h>
#include <errno.h>
#include <math.h>

#include "udm_common.h"
#include "udm_utils.h"
#include "udm_unicode.h"
#include "udm_unidata.h"
#include "udm_uniconv.h"
#include "udm_searchtool.h"
#include "udm_boolean.h"
#include "udm_xmalloc.h"
#include "udm_spell.h"
#include "udm_stopwords.h"
#include "udm_word.h"
#include "udm_vars.h"
#include "udm_db.h"
#include "udm_db_int.h"
#include "udm_url.h"
#include "udm_hash.h"
#include "udm_parsehtml.h"
#include "udm_store.h"
#include "udm_doc.h"
#include "udm_conf.h"
#include "udm_result.h"
#include "udm_log.h"
#include "udm_sgml.h"
#include "udm_mutex.h"
#include "udm_chinese.h"
#include "udm_synonym.h"
#include "udm_sqldbms.h"

#ifdef CHASEN
#include <chasen.h>
#endif

#ifdef MECAB
#include <mecab.h>
#endif

/*
#define DEBUG_CACHE
*/

/********** QSORT functions *******************************/

/*
  qsort comparator for UDM_URL_CRD items:
  larger coord sorts first; equal coords are ordered by ascending url_id.
*/
static int cmpword(UDM_URL_CRD *s1,UDM_URL_CRD *s2)
{
  int rc= (s1->coord < s2->coord) - (s1->coord > s2->coord);
  if (!rc)
    rc= (s1->url_id > s2->url_id) - (s1->url_id < s2->url_id);
  return rc;
}

/*
  qsort comparator for UDM_URL_CRD items:
  ascending url_id; within the same url_id the result is the difference
  of the UDM_WRDPOS() values of the two coords.
*/
static int cmpurlid (UDM_URL_CRD *s1, UDM_URL_CRD *s2)
{
  int by_id= (s1->url_id > s2->url_id) - (s1->url_id < s2->url_id);
  if (by_id)
    return by_id;
  return(UDM_WRDPOS(s1->coord) - UDM_WRDPOS(s2->coord));
}


/*
  Compare the detached item (C, D) against element j of list L
  using a multi-key sort pattern.

  Each pattern character selects one key, tried left to right until
  one of them differs:
    R/r  coord
    P/p  pop_rank
    D/d  last_mod_time
    U/u  url      (strcmp, NULL treated as "")
    S/s  section  (strcmp, NULL treated as "")
  For R, P and D the upper-case letter returns +1 when the detached item
  has the greater value, the lower-case letter returns -1.
  For U and S the upper-case letter returns the negated strcmp() result,
  the lower-case letter returns it unchanged.
  Unknown characters are ignored. Returns 0 when all keys are equal.
*/
static int cmppattern(UDM_URLCRDLIST *L, UDM_URL_CRD *C, UDM_URLDATA *D,
                      long j, const char *pattern)
{
  const char *p;

  for (p= pattern; *p != '\0'; p++)
  {
    const char key= *p;
    int ord= 0;  /* three-way result: item vs. L[j] for the current key */

    switch (key)
    {
      case 'R':
      case 'r':
        ord= (C->coord > L->Coords[j].coord) - (C->coord < L->Coords[j].coord);
        if (ord)
          return (key == 'R') ? ord : -ord;
        break;

      case 'P':
      case 'p':
        ord= (D->pop_rank > L->Data[j].pop_rank) -
             (D->pop_rank < L->Data[j].pop_rank);
        if (ord)
          return (key == 'P') ? ord : -ord;
        break;

      case 'D':
      case 'd':
        ord= (D->last_mod_time > L->Data[j].last_mod_time) -
             (D->last_mod_time < L->Data[j].last_mod_time);
        if (ord)
          return (key == 'D') ? ord : -ord;
        break;

      case 'U':
      case 'u':
        ord= strcmp(UDM_NULL2EMPTY(D->url), UDM_NULL2EMPTY(L->Data[j].url));
        if (ord)
          return (key == 'U') ? -ord : ord;
        break;

      case 'S':
      case 's':
        ord= strcmp(UDM_NULL2EMPTY(D->section),
                    UDM_NULL2EMPTY(L->Data[j].section));
        if (ord)
          return (key == 'S') ? -ord : ord;
        break;

      default:
        break;
    }
  }
  return 0;
}


/*
  qsort comparator for the "RP" pattern:
  descending coord, then descending pop_rank.
*/
static int
cmppatternRP(UDM_URLDATA *D1, UDM_URLDATA *D2)
{
  int by_coord= (D1->coord < D2->coord) - (D1->coord > D2->coord);
  if (by_coord)
    return by_coord;
  return (D1->pop_rank < D2->pop_rank) - (D1->pop_rank > D2->pop_rank);
}


/*
  qsort comparator for the "R" pattern: descending coord.
*/
static int
cmppatternR(UDM_URLDATA *D1, UDM_URLDATA *D2)
{
  return (D1->coord < D2->coord) - (D1->coord > D2->coord);
}


#if NOT_USED
/* Comparator ordering UDM_PHR_DAT items by ascending position. */
static int cmpphr(UDM_PHR_DAT *p1, UDM_PHR_DAT *p2)
{
  return (p1->position > p2->position) - (p1->position < p2->position);
}
#endif

/****************************************************/

/*
  Sort num coordinates with cmpword (highest coord first,
  ties by ascending url_id). Does nothing when wrd is NULL.
*/
void UdmSortSearchWordsByWeight(UDM_URL_CRD *wrd,size_t num)
{
  if (!wrd)
    return;
  UdmSort((void*) wrd, num, sizeof(wrd[0]), (udm_qsort_cmp) cmpword);
}

/*
  Sort num coordinates with cmpurlid (ascending url_id,
  then by word position). Does nothing when wrd is NULL.
*/
void UdmSortSearchWordsByURL(UDM_URL_CRD *wrd,size_t num)
{
  if (!wrd)
    return;
  UdmSort((void*) wrd, num, sizeof(wrd[0]), (udm_qsort_cmp) cmpurlid);
}


/*
  Mirror Coords[i].coord into Data[i].coord for all L->ncoords items,
  so that the Data array can be sorted on its own.
*/
static void
UdmCopyCoordToData(UDM_URLCRDLIST *L)
{
  size_t k= 0;
  while (k < L->ncoords)
  {
    L->Data[k].coord= L->Coords[k].coord;
    k++;
  }
}

/*
  Rebuild Coords[] (url_id and coord) from Data[] for all L->ncoords
  items, bringing Coords back in step with a re-ordered Data array.
*/
static void
UdmCopyCoordFromData(UDM_URLCRDLIST *L)
{
  size_t k= 0;
  while (k < L->ncoords)
  {
    L->Coords[k].url_id= L->Data[k].url_id;
    L->Coords[k].coord=  L->Data[k].coord;
    k++;
  }
}

/*
  qsort comparator: descending site_id.
  The coord / pop_rank tie-breakers are compiled out.
*/
static int
cmpsiteid2(UDM_URLDATA *p1, UDM_URLDATA *p2)
{
  int by_site= (p1->site_id < p2->site_id) - (p1->site_id > p2->site_id);
  if (by_site)
    return by_site;
#if 0
  if (p1->coord    > p2->coord)    return -1;
  if (p1->coord    < p2->coord)    return 1;
  if (p1->pop_rank > p2->pop_rank) return -1;
  if (p1->pop_rank < p2->pop_rank) return 1;
#endif
  return 0;
}


/*
  Sort the first num items of L by descending site_id.

  The sort runs on L->Data: coords are first copied into Data,
  Data is sorted with cmpsiteid2, then Coords is rebuilt from Data.
  Note that the copy steps cover L->ncoords items while
  the sort itself covers num items.
*/
void
UdmSortSearchWordsBySite(UDM_URLCRDLIST *L, size_t num)
{
  UdmCopyCoordToData(L);
  qsort(L->Data, num, sizeof(L->Data[0]), (udm_qsort_cmp) cmpsiteid2);
  UdmCopyCoordFromData(L);
#if 0
  {
    size_t i;
    printf("L->num: %d num: %d\n", L->ncoords, num);
    for (i= 0; i < num; i++)
      printf("%d %d %d %d %.6f\n", i,
      L->Data[i].url_id, L->Data[i].site_id, L->Data[i].coord, L->Data[i].pop_rank);
  }
#endif
}

/*
  Fast path for the "RP" pattern: sort the first num items of L->Data
  with cmppatternRP (descending coord, then descending pop_rank),
  keeping L->Coords in step via the copy helpers.
*/
static void
UdmSortSearchWordsByPatternRP(UDM_URLCRDLIST *L, size_t num)
{
  UdmCopyCoordToData(L);
  qsort(L->Data, num, sizeof(L->Data[0]), (udm_qsort_cmp) cmppatternRP);
  UdmCopyCoordFromData(L);
}


/*
  Fast path for the "R" pattern: sort the first num items of L->Data
  with cmppatternR (descending coord), keeping L->Coords in step
  via the copy helpers.
*/
static void
UdmSortSearchWordsByPatternR(UDM_URLCRDLIST *L, size_t num)
{
  UdmCopyCoordToData(L);
  qsort(L->Data, num, sizeof(L->Data[0]), (udm_qsort_cmp) cmppatternR);
  UdmCopyCoordFromData(L);
}


/* Gap sequence for the shell sort below, smallest gap first. */
static size_t UdmH[] = {1, 5, 19, 41, 109, 209, 505, 929, 2161,
                        3905, 8929, 16001, 36289, 64769};

/*
  Sort the first num items of L (Coords and Data in parallel)
  according to a sort pattern (see cmppattern for the key letters).

  The patterns "RP" and "R" are delegated to qsort-based fast paths.
  Any other pattern is handled by a shell sort that moves the Coords
  and Data entries together. Res is not used here.
*/
void UdmSortSearchWordsByPattern(UDM_RESULT *Res, UDM_URLCRDLIST *L,
                                 size_t num, const char *pattern)
{
  if (strcmp(pattern, "RP") == 0)
  {
    UdmSortSearchWordsByPatternRP(L, num);
  }
  else if (strcmp(pattern, "R") == 0)
  {
    UdmSortSearchWordsByPatternR(L, num);
  }
  else
  {
    ssize_t gap, pos, cur;
    int step= (int) (sizeof(UdmH) / sizeof(UdmH[0])) - 1;
    UDM_URL_CRD Crd;
    UDM_URLDATA Dat;

    /* Pick the largest gap that does not exceed num / 3 (at least gap 1) */
    while (step > 0 && (num / 3) < UdmH[step])
      step--;

    for ( ; step >= 0; step--)
    {
      gap= UdmH[step];
      for (cur= gap; cur < (ssize_t) num; cur++)
      {
        /* Detach item cur and shift greater predecessors up by one gap */
        Crd= L->Coords[cur];
        Dat= L->Data[cur];

        for (pos= cur - gap;
             pos >= 0 && cmppattern(L, &Crd, &Dat, pos, pattern) > 0;
             pos-= gap)
        {
          L->Coords[pos + gap]= L->Coords[pos];
          L->Data[pos + gap]= L->Data[pos];
        }

        L->Coords[pos + gap]= Crd;
        L->Data[pos + gap]= Dat;
      }
    }
  }

#if 0
  {
    size_t i;
    printf("L->num: %d num:%d pattern:%s\n", L->ncoords, num, pattern);
    for (i= 0; i < num; i++)
    {
      printf("[%d]%d %d %d %d %.6f\n", i,
             L->Coords[i].url_id, L->Coords[i].coord,
             L->Data[i].url_id, L->Data[i].coord, (float) L->Data[i].pop_rank);
    }
  }
#endif
  return;
}


/*#define DEBUG_TOP_SORT*/

/*
  Find topcount best results.

  On return the first topcount items of wrd[] are the best ones in
  cmpword order (higher coord first, ties by ascending url_id).
  The rest of the array is left in unspecified order.

  The first topcount+1 items are sorted, wrd[topcount] acting as the
  "worst kept" sentinel. Every further item that ranks ahead of the
  sentinel is inserted into the sorted head by binary search, and the
  item pushed out of the head takes its place.

  If topcount >= nwrd the whole array is simply sorted.
  Nothing is done for a NULL or empty array.
*/
void UdmWrdTopSort(UDM_URL_CRD *wrd, size_t nwrd,size_t topcount)
{
  size_t j;
  UDM_URL_CRD w;
  
#ifdef DEBUG_TOP_SORT
  fprintf(stderr,"top+1=%d nwrd=%d\n",topcount+1,nwrd);
#endif

  if (!wrd || !nwrd)
    return;

  /* Sorting topcount+1 items would run past the end of the array */
  if (topcount >= nwrd)
  {
    UdmSortSearchWordsByWeight(wrd, nwrd);
    return;
  }
  
  UdmSortSearchWordsByWeight(wrd,topcount+1);
  for(j=topcount;j<nwrd;j++)
  {
    int better; /* wrd[j] ranks ahead of the sentinel wrd[topcount] */

    /*
      Compare explicitly: subtracting url_id values can overflow
      (or wrap, for an unsigned type) and yield the wrong sign.
    */
    if (wrd[j].coord != wrd[topcount].coord)
      better= wrd[j].coord > wrd[topcount].coord;
    else
      better= wrd[j].url_id < wrd[topcount].url_id;
    
#ifdef DEBUG_TOP_SORT
fprintf(stderr,"(%d,%d) %d (%d,%d) %d\n",
    wrd[topcount].coord,wrd[topcount].url_id,topcount,
    wrd[j].coord,wrd[j].url_id,j);
#endif

    if (better)
    {
      size_t l= 0, r= topcount;
      
      /* Binary search for the insert position r inside wrd[0..topcount) */
      while(l<r)
      {
        size_t c= l + (r - l) / 2;
        int before; /* wrd[c] ranks ahead of wrd[j] */

        if (wrd[c].coord != wrd[j].coord)
          before= wrd[c].coord > wrd[j].coord;
        else
          before= wrd[c].url_id < wrd[j].url_id;
        
        if (before)
          l= c + 1;
        else
          r= c;
      }
      /* Shift the tail down by one; the old sentinel moves to slot j */
      w=wrd[topcount];
      memmove(&wrd[r+1],&wrd[r],(topcount-r)*sizeof(*wrd));
      wrd[r]=wrd[j];
      wrd[j]=w;
    }
  }
}


/* Capacity of the Res[] array of generated word forms in UdmAllForms1() */
#define UDM_MAX_FORMS 256
/* Capacity of the Norm[] array of normal forms in UdmAllForms1() */
#define UDM_MAX_NORMS 64

/*
  All the following combinations should
  work and get as many uword forms as possible:

  1. uword doesn't exist in ispell, its synonym doesn't exist in ispell.
     This last combination should also work if no ispell dictionaries loaded.
     Just copy all synonyms into result.
  2. DONE: both norm(uword) and its synonym exist in ispell
  3. norm(uword) exists in ispell, its synonym doesn't exist in ispell.
  4. uword doesn't exist in ispell, its synonym exists in ispell.
*/

/*
  Generate ispell-based word forms for uword and append them to result.

  For every affix list / spell list pair with the same language and
  charset (compared case-insensitively):
    1. the word is converted from the local charset (lcs) to the
       dictionary charset when they differ;
    2. the word is normalized with UdmSpellNormalize();
    3. if the "sy" variable is non-zero and synonyms are loaded, synonyms
       of each normal form are looked up and normalized as well;
    4. each normal form plus its UdmSpellDenormalize() forms are collected
       in Res[] (at most UDM_MAX_FORMS strings), converted back to lcs.
  Finally every collected string is converted to Unicode and added to
  result with origin UDM_WORD_ORIGIN_SPELL; the strings are then freed.

  Returns NULL without touching result when the "sp" variable is 0,
  otherwise returns result.
*/
static UDM_WIDEWORDLIST *UdmAllForms1(UDM_AGENT *Indexer,
                                      UDM_WIDEWORDLIST *result,
                                      const UDM_WIDEWORD *uword)
{
  UDM_SPELLLISTLIST *SLL= &Indexer->Conf->Spells;
  UDM_AFFIXLISTLIST *ALL= &Indexer->Conf->Affixes;
  UDM_SYNONYMLIST   *SYN= &Indexer->Conf->Synonyms;
  char *Res[UDM_MAX_FORMS];           /* collected forms, heap strings */
  char **ResCur= Res;                 /* next free slot in Res[]       */
  char **ResEnd= Res + UDM_MAX_FORMS;
  char **R;
  UDM_AFFIXLIST *Al;  
  UDM_WIDEWORD w;
  UDM_CONV lcs_uni;
  UDM_CHARSET *lcs= Indexer->Conf->lcs;
  int sy= UdmVarListFindInt(&Indexer->Conf->Vars, "sy", 1);
  int sp= UdmVarListFindInt(&Indexer->Conf->Vars, "sp", 1);

  if (!sp)
    return NULL;

  for (Al= ALL->Item; Al < &ALL->Item[ALL->nitems]; Al++)
  {
    UDM_SPELLLIST *Sl;
    for (Sl= SLL->Item; Sl < &SLL->Item[SLL->nitems]; Sl++)
    {
      if (!strcasecmp(Al->lang, Sl->lang) && !strcasecmp(Al->cset, Sl->cset))
      {
        UDM_SPELL Norm[UDM_MAX_NORMS];
        UDM_SPELL *NormEnd= Norm + UDM_MAX_NORMS;
        UDM_SPELL *NormCur= Norm;
        UDM_SPELL *N;
        char tmp[256];
        char *word= uword->word;
        
        /* Recode the word into the dictionary charset if needed */
        if (lcs != Sl->cs)
        {
          UDM_CONV lcs_scs;
          size_t len= strlen(word);
          UdmConvInit(&lcs_scs, lcs, Sl->cs, UDM_RECODE_HTML);
          /* NOTE(review): whether tmp is NUL-terminated when the output
             is truncated at sizeof(tmp) is not visible here - confirm */
          UdmConv(&lcs_scs, tmp, sizeof(tmp), word, len + 1);
          word= tmp;
        }
        
        NormCur+= UdmSpellNormalize(Sl, Al, word, NormCur, NormEnd-NormCur);
        
        if (sy && SYN->nsynonyms)
        {
          UDM_CONV scs_uni, uni_scs;
          UDM_WIDEWORD ww;
          UDM_WIDEWORDLIST *syn;
          int u[128];
          bzero((void*)&ww, sizeof(ww));
          ww.uword= u;
          ww.order= uword->order;
          UdmConvInit(&scs_uni, Sl->cs, &udm_charset_sys_int, UDM_RECODE_HTML);
          UdmConvInit(&uni_scs, &udm_charset_sys_int, Sl->cs, UDM_RECODE_HTML);
          /* 
            Find synonyms for each normal form
            and add the found synonyms into normalized
            list for further denormalization.
            NormCur may grow inside the loop, so the normal forms
            added for synonyms are visited by this loop too.
          */
          for (N= Norm; N < NormCur; N++)
          {
            UdmConv(&scs_uni,(char*)&u,sizeof(u),N->word,strlen(N->word)+1);
            if ((syn= UdmSynonymListFind(SYN, &ww)))
            {
              UDM_WIDEWORD *W;
              for (W= syn->Word; W < syn->Word + syn->nwords; W++)
              {
                size_t ubytes= (W->ulen + 1) * sizeof(int);
                UdmConv(&uni_scs, tmp, sizeof(tmp), (char*) W->uword, ubytes);
                if (NormCur < NormEnd)
                {
                  NormCur+= UdmSpellNormalize(Sl, Al, tmp, NormCur, NormEnd-NormCur);
                }
              }
              UdmWideWordListFree(syn);
              UdmFree(syn);
            }
          }
        }
        
        /* Expand every normal form into its word forms */
        for (N= Norm ; N < NormCur; N++)
        {
          if (ResCur < ResEnd)
          {
            size_t cres= 1;  /* the normal form itself + denormalized forms */
            *ResCur= UdmStrdup(N->word);
            cres+= UdmSpellDenormalize(Sl, Al, N, ResCur+1, ResEnd-ResCur-1);
            /* Recode the new forms back into the local charset */
            if (lcs != Sl->cs)
            {
              size_t i;
              UDM_CONV scs_lcs;
              UdmConvInit(&scs_lcs, Sl->cs, lcs, UDM_RECODE_HTML);
              for (i=0; i < cres; i++)
              {
                UdmConv(&scs_lcs, tmp, sizeof(tmp),
                        ResCur[i], strlen(ResCur[i])+1);
                UdmFree(ResCur[i]);
                ResCur[i]= UdmStrdup(tmp);
              }
            }
            ResCur+= cres;
          }
        }
      }
    }
  }
  
  UdmConvInit(&lcs_uni, lcs, &udm_charset_sys_int, UDM_RECODE_HTML);
  
  /* Publish the collected forms; w.uword is one buffer reused per form */
  bzero((void*)&w, sizeof(w));
  for (R=Res; R < ResCur; R++)
  {
    size_t nbytes;
    w.order= uword->order;
    w.phrpos= uword->phrpos;
    w.phrlen= uword->phrlen;
    w.count= 0;
    w.origin= UDM_WORD_ORIGIN_SPELL;
    w.word= *R;
    w.len= strlen(w.word);
    nbytes= (w.len + 1) * sizeof(int);
    /* NOTE(review): the UdmRealloc() result is not checked for NULL */
    w.uword= UdmRealloc(w.uword, nbytes);
    w.ulen= UdmConv(&lcs_uni, (char*) w.uword, nbytes, w.word, w.len + 1);
    UdmWideWordListAdd(result, &w);
    UdmFree(*R);
  }
  UdmFree(w.uword);


  return result;
}


/*
  Cyrillic -> Latin transliteration strings.
  Used through tr_cyr_lat, whose range is 0x430..0x451, so entry i is
  the replacement for code point 0x430 + i (34 entries in total).
  Entry 32 (code point 0x450) is the empty string.
*/
static const char *translit_cyr_lat[]=
{
  "a",  "b",  "v",  "g",  "d",  "e",  "zh", "z",
  "i",  "j",  "k",  "l",  "m",  "n",  "o",  "p",
  "r",  "s",  "t",  "u",  "f",  "h",  "c",  "ch",
  "sh", "sch","`",  "y",  "'",  "`e", "yu", "ya",
  "",   "yo"
};


/*
  Latin -> Cyrillic single-letter transliteration strings.
  Used through tr_lat_cyr, whose range is 0x61..0x7A ('a'..'z'), so
  entry i is the replacement for the letter 'a' + i (26 entries).
  Replacements are written as HTML numeric entities; the converter in
  UdmAllFormsTranslit() is initialized with UDM_RECODE_HTML.
*/
static const char *translit_lat_cyr[]=
{
  "&#x430;", "&#x431;", "&#x446;", "&#x434;",
  "&#x435;", "&#x444;", "&#x433;", "&#x445;",
  "&#x438;", "&#x439;", "&#x43a;", "&#x43b;",
  "&#x43c;", "&#x43d;", "&#x43e;", "&#x43f;",
  "&#x433;", "&#x440;", "&#x441;", "&#x442;",
  "&#x443;", "&#x432;", "&#x432;", "&#x43a;&#x441;",
  "&#x44b;", "&#x437;"
};


/*
  One multi-character transliteration rule:
  the character sequence "from" is replaced with the string "to".
  Arrays of these rules end with an entry whose "from" is NULL.
*/
typedef struct udm_translit_complex_subst_st
{
  const char *from;  /* source sequence to match           */
  const char *to;    /* replacement for the whole sequence */
} UDM_TRANSLIT_COMPLEX_SUBST;


/*
  Multi-letter Latin -> Cyrillic rules, tried by UdmAllFormsTranslit()
  in array order before the single-letter table; the first rule whose
  "from" matches at the current position wins.
  Entries under NOT_YET are compiled out unless that macro is defined.
  The list ends with a {NULL, NULL} entry.
*/
static UDM_TRANSLIT_COMPLEX_SUBST translit_lat_cyr_complex[]=
{
#if NOT_YET
  {"`"  , "&#x44a;"},
  {"'"  , "&#x44c;"},
  {"`e" , "&#x44d;"},
#endif
  {"ch" , "&#x447;"},
  {"sch", "&#x449;"},
  {"ya" , "&#x44f;"},
  {"zh" , "&#x436;"},
  {"yo" , "&#x451;"},
  {"kh" , "&#x445;"},
  {"sh" , "&#x448;"},
#ifdef NOT_YET
  {"yu" , "&#x44e;"}, /* ambiguous: YERU + U, or YU   */
#endif
  {NULL, NULL}
};


/*
  Description of one transliteration direction.
  Only code points in the inclusive range [from, to] are substituted.
*/
typedef struct udm_translit_st
{
  size_t from;                          /* first code point handled           */
  size_t to;                            /* last code point handled            */
  const char * const *translit;         /* per-code-point replacements,
                                           indexed by (code point - from)     */
  UDM_TRANSLIT_COMPLEX_SUBST *complex;  /* multi-character rules, or NULL     */
} UDM_TRANSLIT_TABLE;


/* Cyrillic -> Latin: code points 0x430..0x451, no multi-character rules */
static UDM_TRANSLIT_TABLE tr_cyr_lat=
{
  0x430, 0x451, translit_cyr_lat, NULL
};


/* Latin -> Cyrillic: letters 0x61..0x7A plus the multi-character rules */
static UDM_TRANSLIT_TABLE tr_lat_cyr=
{
  0x61, 0x7A, translit_lat_cyr, translit_lat_cyr_complex
};


/*
  Build a transliterated variant of uword using table tr.

  The Unicode word is scanned left to right. A character inside
  [tr->from, tr->to] is replaced: multi-character rules (tr->complex)
  are tried first, otherwise the single-character table is used.
  Characters outside the range are copied unchanged.
  If at least one substitution happened, the new word is added to
  result with origin UDM_WORD_ORIGIN_SYNONYM.

  Always returns UDM_OK.
*/
static int UdmAllFormsTranslit(UDM_AGENT *A, UDM_WIDEWORDLIST *result,
                               const UDM_WIDEWORD *uword,
                               const UDM_TRANSLIT_TABLE *tr)
{
  /* te leaves two spare ints at the end of tword[] */
  int *wrd, tword[128], *t, *te= tword + 128 - 2;
  int subst= 0;  /* number of substitutions made */
  UDM_CHARSET *latin1= UdmGetCharSet("iso-8859-1");
  UDM_CONV l1_uni;
  /* Replacement strings are converted as latin1 text with HTML entities */
  UdmConvInit(&l1_uni, latin1, &udm_charset_sys_int, UDM_RECODE_HTML);
  for (wrd= uword->uword, t= tword; wrd && wrd[0] && t < te; )
  {
    if (*wrd >= tr->from && *wrd <= tr->to)
    {
      const char *repl= NULL;
      size_t len;
      UDM_TRANSLIT_COMPLEX_SUBST *cmpl;
      /* Try the multi-character rules first, in table order */
      for (cmpl= tr->complex; cmpl && cmpl->from; cmpl++)
      {
        size_t pos;
        const char *from= cmpl->from;
        for (pos=0; from[pos] && from[pos] == wrd[pos]; pos++);
        if (!from[pos])
        {
          /* The whole "from" sequence matched: consume it */
          repl= cmpl->to;
          wrd+= pos;
          break;
        }
      }
      if (!repl)
      {
        /* Fall back to the single-character table */
        repl= tr->translit[*wrd - tr->from];
        wrd++;
      }
      
      len= strlen(repl);
      /* The return value is used as the number of bytes written to t */
      len= UdmConv(&l1_uni, (char*) t, (te - t) * sizeof(*t), repl, len);
      t+= len / sizeof(*t);
      subst++;
    }
    else
    {
      *t++= *wrd++;
    }
  }
  *t= 0;
  if (subst)
  {
    UDM_WIDEWORD w;
    char lcsword[128];
    UDM_CONV uni_lcs;
    size_t nbytes= (t - tword + 1) * sizeof(*t);  /* includes the final 0 */
    bzero((void*)&w, sizeof(w));
    UdmConvInit(&uni_lcs, &udm_charset_sys_int, A->Conf->lcs, UDM_RECODE_HTML);
    /* NOTE(review): lcsword is 128 bytes while tword can hold up to 126
       code points; whether UdmConv() NUL-terminates a truncated result
       is not visible here - confirm before relying on strlen() below */
    UdmConv(&uni_lcs, lcsword, sizeof(lcsword), (const char *) tword, nbytes);
    w.order= uword->order;
    w.phrpos= uword->phrpos;
    w.phrlen= uword->phrlen;
    w.count= 0;
    w.origin= UDM_WORD_ORIGIN_SYNONYM;
    w.word= lcsword;
    w.len= strlen(w.word);
    w.uword= tword;
    w.ulen= t - tword;
    UdmWideWordListAdd(result, &w);
  }
  return UDM_OK;
}


/*
  Add word forms of uw to result from two sources:
    - ispell dictionaries, via UdmAllForms1();
    - when built with HAVE_SQL, the "SQLWordForms" variable is set and
      at least one database is configured: the rows returned by that
      query, run on the first database. Column 0 of each row is added
      as a form with origin UDM_WORD_ORIGIN_SYNONYM.

  Returns UDM_ERROR if the SQL query fails, UDM_OK otherwise.
*/
static int
UdmAllForms2(UDM_AGENT *Indexer,
             UDM_WIDEWORDLIST *result,
             UDM_WIDEWORD *uw)
{
  UdmAllForms1(Indexer, result, uw);
#ifdef HAVE_SQL
  {
    const char *sql= UdmVarListFindStr(&Indexer->Conf->Vars, "SQLWordForms", NULL);
    if (sql && Indexer->Conf->dbl.nitems)
    {
      char *word= uw->word;
      char buf[1024];
      size_t i, nrows;
      int uword[128];  /* Unicode buffer shared by all added forms */
      UDM_SQLRES SQLRes;
      UDM_CONV lcs_uni;
      UDM_WIDEWORD form;
      
      /* NOTE(review): the word is passed as the single parameter of the
         query template; whether UdmBuildParamStr() escapes it for SQL
         is not visible here - confirm */
      UdmBuildParamStr(buf, sizeof(buf), sql, &word, 1);
      if (UDM_OK != UdmSQLQuery(&Indexer->Conf->dbl.db[0], &SQLRes, buf))
        return UDM_ERROR;
      nrows= UdmSQLNumRows(&SQLRes);
      bzero((void*) &form, sizeof(UDM_WIDEWORD));
      form.order= uw->order;
      form.phrpos= uw->phrpos;
      form.phrlen= uw->phrlen;
      form.count= 0;
      form.uword= uword;
      form.origin= UDM_WORD_ORIGIN_SYNONYM;
      form.weight= 0;
      form.match= uw->match;
      UdmConvInit(&lcs_uni, Indexer->Conf->lcs, &udm_charset_sys_int, UDM_RECODE_HTML);
      for (i= 0; i < nrows; i++)
      {
        /* form.word points into the SQL result, valid until UdmSQLFree() */
        form.len= UdmSQLLen(&SQLRes, 0, i);
        form.word= (char*) UdmSQLValue(&SQLRes, 0, i);
        form.ulen= UdmConv(&lcs_uni, (char*) uword, sizeof(uword), form.word, form.len + 1);
        UdmWideWordListAdd(result, &form);
      }
      UdmSQLFree(&SQLRes);
    }
  }
#endif
  return UDM_OK;
}


/*
  Collect all alternative forms of uword into result:
    1. ispell / SQL forms of the word itself (UdmAllForms2);
    2. if the "tl" variable is true: the transliterated word in both
       directions (Cyrillic->Latin, Latin->Cyrillic) plus its forms;
    3. unless the "sy" variable is 0: every synonym of the word
       plus the forms of each synonym.

  Returns result.
*/
static UDM_WIDEWORDLIST *UdmAllForms(UDM_AGENT *Indexer,
                                     UDM_WIDEWORDLIST *result,
                                     UDM_WIDEWORD *uword)
{
  UDM_WIDEWORDLIST *uwordsyn;
  
  /*
    Generate all possible word forms for uword.
  */
  UdmAllForms2(Indexer, result, uword);
  
  if (UdmVarListFindBool(&Indexer->Conf->Vars, "tl", 0))
  {
    UDM_TRANSLIT_TABLE *tbl[]= {&tr_cyr_lat, &tr_lat_cyr, NULL}, **cur;
    for (cur= tbl; *cur; cur++)
    {
      UDM_WIDEWORDLIST translit;
      UdmWideWordListInit(&translit);
      UdmAllFormsTranslit(Indexer, &translit, uword, *cur);
      if (translit.nwords)
      {
        UDM_WIDEWORD *ww= &translit.Word[0];
        UdmWideWordListAdd(result, ww);
        UdmAllForms2(Indexer, result, ww);
      }
      UdmWideWordListFree(&translit);
    }
  }
  
  if (!UdmVarListFindInt(&Indexer->Conf->Vars, "sy", 1))
    return result;
  /*
     Combination one: uword is possibly a normalized form.
     Find all uword synonyms and then process them through
     ispell to generate all word forms for the synonyms.
  */
  if ((uwordsyn= UdmSynonymListFind(&Indexer->Conf->Synonyms, uword)))
  {
    UDM_WIDEWORD *ww;
    UDM_CONV uni_lcs;
    UdmConvInit(&uni_lcs, &udm_charset_sys_int, Indexer->Conf->lcs, UDM_RECODE_HTML); 

    for (ww= uwordsyn->Word; ww < &uwordsyn->Word[uwordsyn->nwords]; ww++)
    {
      char tmp[256];
      /* Skip the synonym that is the word itself */
      if (!UdmUniStrCmp(uword->uword, ww->uword))
        continue;
      /*
        Convert into sizeof(tmp) - 1 bytes at most: the terminator is
        stored at tmp[len] below and must stay inside the buffer.
      */
      ww->len= UdmConv(&uni_lcs, tmp, sizeof(tmp) - 1,
                       (char*) ww->uword, ww->ulen*sizeof(int));
      ww->word= tmp;
      ww->word[ww->len]= '\0';
      UdmWideWordListAdd(result, ww);
      UdmAllForms2(Indexer, result, ww);
      /* tmp goes out of scope: do not leave a dangling pointer behind */
      ww->len= 0;
      ww->word= NULL;
    }
    UdmWideWordListFree(uwordsyn);
    UdmFree(uwordsyn);
  }
  return result;
}


/*
  Return 1 if ch is one of the delimiters that glue words
  into an automatic phrase ('_', '-', '.', '/'), 0 otherwise.
*/
static int
UdmAutoPhraseChar(int ch)
{
  switch (ch)
  {
    case '_':
    case '-':
    case '.':
    case '/':
      return 1;
    default:
      return 0;
  }
}


/*
  Append a copy of *item to the boolean stack of Res.

  The stack grows by UDM_MAXSTACK items whenever it is full.
  Returns UDM_OK on success. Returns UDM_ERROR if the stack cannot be
  grown; in that case the existing stack (items, mitems, nitems) is
  left untouched.
*/
static int
UdmStackItemAdd(UDM_RESULT *Res, UDM_STACK_ITEM *item)
{
  if (Res->nitems >= Res->mitems)
  {
    /*
      Grow through temporaries: assigning the realloc result straight
      to Res->items would lose the old block on failure.
    */
    size_t mitems= Res->mitems + UDM_MAXSTACK;
    UDM_STACK_ITEM *items= (UDM_STACK_ITEM*) UdmRealloc(Res->items,
                                                        mitems *
                                                        sizeof(UDM_STACK_ITEM));
    if (!items)
      return UDM_ERROR;
    Res->items= items;
    Res->mitems= mitems;
  }
  Res->items[Res->nitems]= item[0];
  Res->nitems++;
  return UDM_OK;
}


typedef struct udm_stack_parser_state_st
{
  int secno;
  int next_word_match_type;
  int use_numeric_operators;
  int word_match;
  int nphrasecmd;
  int auto_phrase;
  int phrpos;
  const char *lang;
} UDM_STACK_PARSER_STATE;



/*
  Translate a run of separator characters into boolean stack commands.

  lex[0..length) is scanned character by character:
    '&' '+'  AND            '|'  OR          '~'  NOT
    '('      left bracket   ')'  right bracket
    '"'      phrase delimiter (also counted in state->nphrasecmd)
    '<' '>'  select a numeric match type for the next word when numeric
             operators are enabled; they push nothing on the stack
  Every command resets the match type of the next word to the default.
  Any other character is ignored, except that a character which is not
  an auto-phrase delimiter closes a pending automatic phrase.

  Inside a phrase all commands but the phrase delimiter are dropped.

  Returns UDM_OK, or the error of the first stack item that could not
  be added (the rest of the run is then not processed).
*/
static int
UdmStackItemAddCmd(UDM_RESULT *Res,
                   UDM_STACK_PARSER_STATE *state,
                   int *lex, size_t length)
{
  int rc= UDM_OK;
  size_t i;
  UDM_STACK_ITEM item;
  
  for (i = 0; i < length; i++)
  {
     switch(lex[i])
     {
       case '&':
       case '+':
         item.cmd= UDM_STACK_AND;
         state->next_word_match_type= state->word_match;
         break;
       case '|':
         item.cmd= UDM_STACK_OR;
         state->next_word_match_type= state->word_match;
         break;
       case '~':
         item.cmd= UDM_STACK_NOT;
         state->next_word_match_type= state->word_match;
         break;
       case '(':
         item.cmd= UDM_STACK_LEFT;
         state->next_word_match_type= state->word_match;
         break;
       case ')':
         item.cmd= UDM_STACK_RIGHT;
         state->next_word_match_type= state->word_match;
         break;
       case '"':
         item.cmd= UDM_STACK_PHRASE;
         state->next_word_match_type= state->word_match;
         state->nphrasecmd++;
         break;
       case '<':
         state->next_word_match_type= state->use_numeric_operators ?
                                      UDM_MATCH_NUMERIC_LT : state->word_match;
         /*
           Only a modifier of the next word: item.cmd is not set here,
           so nothing must be pushed on the stack.
         */
         continue;
       case '>':
         state->next_word_match_type= state->use_numeric_operators ? 
                                      UDM_MATCH_NUMERIC_GT : state->word_match;
         continue; /* modifier only, see '<' */
       default:
         if (state->auto_phrase && !UdmAutoPhraseChar(lex[i]))
         {
           /* End of automatic phrase: emit the closing phrase command */
           int quot= '"';
           rc= UdmStackItemAddCmd(Res, state, &quot, 1);
           state->auto_phrase= 0;
           if (rc != UDM_OK)
             return rc;
         }
         continue;
     }
     /* Ignore all operators if we are in a phrase,  except phrase end. */
    if (!(state->nphrasecmd % 2) || item.cmd == UDM_STACK_PHRASE)
    {
      item.arg= 0;
      if (UDM_OK != (rc= UdmStackItemAdd(Res, &item)))
        return rc;
      Res->ncmds++;
    }
  }
  return rc;
}


/*
  Add one query word to the boolean stack and to Res->WWList,
  followed by all its alternative forms (see UdmAllForms).

  query   - agent; supplies stopwords and word length limits
  Res     - result whose stack and word list are extended
  state   - parser state (phrase tracking, section, match type)
  lex     - start of the token in the query (not used here)
  length  - token length, checked against min/max word length
  lt      - the first character after the token
  uwrd    - the word in Unicode
  wrd     - the word in the local charset; this buffer is reused as
            scratch space for the converted word forms
  uni_lc  - Unicode -> local charset converter

  A word followed by an auto-phrase delimiter opens an automatic phrase.
  With full-word matching, stopwords and words outside the length limits
  are stored as UDM_STACK_STOP items and get no forms.

  Nothing is added once the limit of UDM_MAXWORDPERQUERY-1 unique words
  is reached. Returns UDM_OK, or the error of a failed stack insert.
*/
static int
UdmStackItemAddWord(UDM_AGENT *query, UDM_RESULT *Res,
                   UDM_STACK_PARSER_STATE *state,
                   int *lex, size_t length, int *lt,
                   int *uwrd, char *wrd, UDM_CONV *uni_lc)
{
  UDM_WIDEWORD OWord;
  UDM_WIDEWORDLIST Forms;
  UDM_STACK_ITEM item;
  int origin, rc, lt_auto_phrase= UdmAutoPhraseChar(lt[0]);
  size_t phrlen= 0;

  if(Res->WWList.nuniq >= UDM_MAXWORDPERQUERY-1)
    return UDM_OK;

  if (state->nphrasecmd % 2) /* in phrase */
  {
    if (((state->auto_phrase && !lt_auto_phrase) || lt[0] == '"'))
    {
      /* End of auto- or non-auto-phrase*/
      phrlen= state->phrpos + 1;
    }
  }
  else /* not in phrase */
  {
    if (lt_auto_phrase)
    {
      /* Start of auto-phrase */
      int quot= '"';
      state->auto_phrase= 1;
      if (UDM_OK != (rc= UdmStackItemAddCmd(Res, state, &quot, 1)))
        return rc;
      phrlen= 0;
    }
    else
      phrlen= 1; /* Single word */
  }
  
  item.cmd= UDM_STACK_WORD;
  item.arg= Res->WWList.nuniq;
  /* Stop here on failure: the stack top is patched below */
  if (UDM_OK != (rc= UdmStackItemAdd(Res, &item)))
    return rc;


  /*
    Check stopword only when full word.
    Substring searches should not exclude them.
  */
  if(state->word_match == UDM_MATCH_FULL &&
     (UdmStopListFind(&query->Conf->StopWords, wrd, state->lang) ||
      query->Conf->WordParam.min_word_len > length ||
      query->Conf->WordParam.max_word_len < length))
  {
    origin= UDM_WORD_ORIGIN_STOP;
    Res->items[Res->nitems - 1].cmd= UDM_STACK_STOP;
  }
  else
  {
    origin= UDM_WORD_ORIGIN_QUERY;
  }

  /* Zero first, so that no field is passed on with a garbage value */
  bzero((void*) &OWord, sizeof(OWord));
  OWord.len= strlen(wrd);
  OWord.order= Res->WWList.nuniq;
  OWord.phrpos= state->phrpos;
  OWord.phrlen= phrlen;
  OWord.count= 0;
  OWord.word= wrd;
  OWord.uword= uwrd;
  OWord.origin = origin;
  OWord.match= state->next_word_match_type;
  OWord.secno= state->secno;
  UdmWideWordListAdd(&Res->WWList, &OWord);
  if (state->nphrasecmd % 2)
    state->phrpos++;
      
  if (origin == UDM_WORD_ORIGIN_STOP)
    return UDM_OK;
      
  UdmWideWordListInit(&Forms);
  if(UdmAllForms(query,&Forms,&OWord))
  {
    UDM_WIDEWORD FWord;
    size_t frm;
    bzero((void*) &FWord, sizeof(FWord));
    for (frm= 0; frm < Forms.nwords ; frm++)
    {
      /* wrd is overwritten with the form in the local charset */
      UdmConv(uni_lc,wrd,12*query->Conf->WordParam.max_word_len,
             (char*)(Forms.Word[frm].uword),
              sizeof(Forms.Word[frm].uword[0])*(UdmUniLen(Forms.Word[frm].uword)+1));
      FWord.len= strlen(wrd);
      FWord.order= Res->WWList.nuniq;
      /* NOTE(review): inside a phrase state->phrpos was already advanced
         above, so forms get the position after the original word - confirm */
      FWord.phrpos= state->phrpos;
      FWord.phrlen= phrlen;
      FWord.count= 0;
      FWord.word= wrd;
      FWord.uword= Forms.Word[frm].uword;
      FWord.origin = Forms.Word[frm].origin;
      FWord.match= state->next_word_match_type;
      FWord.secno= state->secno;
      UdmWideWordListAdd(&Res->WWList,&FWord);
/*    UdmLog(query, UDM_LOG_DEBUG, "Word form: [%d] %s", FWord.origin, wrd);*/
    }
  }
  UdmWideWordListFree(&Forms);
  Res->WWList.nuniq++;
  return rc;
}


/*
  Parse the search query and build the boolean search stack in Res.

  The query is taken from the "q" variable (in the browser charset).
  Steps:
    - "q" and "qprev" are replaced by copies converted into the
      local charset;
    - the query is converted to Unicode, lower-cased and segmented
      (language chosen from the browser charset family);
    - the text is split into separator and word tokens: separators go
      to UdmStackItemAddCmd(), words to UdmStackItemAddWord();
      a word followed by ':' that names a known section, or has the
      form "secnoN", only selects the section for the following words;
    - an unterminated phrase is closed at the end.

  Browser and local charsets default to iso-8859-1 when not configured.
  Words longer than WordParam.max_word_len are truncated to that length.

  Returns UDM_OK on success, UDM_ERROR if memory cannot be allocated.
*/
int UdmPrepare(UDM_AGENT * query,UDM_RESULT *Res)
{
  UDM_CHARSET * browser_cs, * local_cs, *sys_int;
  int  ctype;
  int *ustr= NULL, *lt, *lex;
  size_t ulen;
  const char * txt = UdmVarListFindStr(&query->Conf->Vars,"q","");
  const char * qprev = UdmVarListFindStr(&query->Conf->Vars,"qprev","");
  char *ltxt= NULL;
  size_t wlen, llen;
  char *wrd= NULL;
  int *uwrd= NULL;
  UDM_CONV uni_lc, bc_uni, bc_lc;
  const char *lang;
  UDM_STACK_PARSER_STATE state;
  int rc= UDM_ERROR;
  
  state.secno= 0;
  state.use_numeric_operators= UdmVarListFindBool(&query->Conf->Vars, "UseNumericOperators", 0);
  state.nphrasecmd= 0;
  state.word_match= UdmMatchMode(UdmVarListFindStr(&query->Conf->Vars, "wm", "wrd"));
  state.next_word_match_type= state.word_match;
  state.lang= UdmVarListFindStr(&query->Conf->Vars, "g", NULL);
  state.auto_phrase= 0;
  state.phrpos= 0;
  
  /* Word buffers: up to 12 bytes per character in the local charset */
  if ((wrd = (char*)UdmMalloc(query->Conf->WordParam.max_word_len * 12 + 1)) == NULL)
    goto ret;
  if ((uwrd = (int*)UdmMalloc(sizeof(int) * (query->Conf->WordParam.max_word_len + 1))) == NULL)
    goto ret;


  if (!(browser_cs = query->Conf->bcs))
    browser_cs=UdmGetCharSet("iso-8859-1");
  
  if(!(local_cs = query->Conf->lcs))
    local_cs=UdmGetCharSet("iso-8859-1");
  
  sys_int= &udm_charset_sys_int;
  
  UdmConvInit(&bc_uni,browser_cs,sys_int,UDM_RECODE_HTML);
  UdmConvInit(&uni_lc,sys_int,local_cs,UDM_RECODE_HTML);
  UdmConvInit(&bc_lc,browser_cs,local_cs,UDM_RECODE_HTML);
  
  ulen=strlen(txt);
  if (!(ustr= (int*) UdmMalloc((sizeof(int))*(ulen+1))))
    goto ret;
  UdmConv(&bc_uni,(char*)ustr,sizeof(ustr[0])*(ulen+1),txt,ulen+1);
  
  /* Create copy of query, converted into LocalCharset (for UdmTrack) */
  llen = ulen * 14 + 1;
  if (!(ltxt= (char*) UdmMalloc(llen)))
    goto ret;
  /* Convert into llen - 1 bytes: the last byte is kept for the terminator */
  UdmConv(&uni_lc,ltxt,llen - 1,(char*)ustr,bc_uni.obytes);
  ltxt[uni_lc.obytes]='\0';
  UdmVarListReplaceStr(&query->Conf->Vars,"q",ltxt);  /* "q-lc" was here */
  UDM_FREE(ltxt);
  ltxt= NULL;
  
  llen = strlen(qprev);
  if (!(ltxt= (char*) UdmMalloc(llen*14+1)))
    goto ret;
  UdmConv(&bc_lc,ltxt,llen*14,qprev,llen);
  ltxt[bc_lc.obytes]='\0';
  UdmVarListReplaceStr(&query->Conf->Vars,"qprev",ltxt);
  UDM_FREE(ltxt);
  ltxt= NULL;
  
  /* Parse query and build boolean search stack*/
  UdmUniStrToLower(ustr);
  switch(browser_cs->family)
  {
    case UDM_CHARSET_CHINESE_SIMPLIFIED:
    case UDM_CHARSET_CHINESE_TRADITIONAL: lang = "zh"; break;
    case UDM_CHARSET_JAPANESE: lang = "ja"; break;
    case UDM_CHARSET_THAI: lang = "th"; break;
    default: lang = "";
  }
  ustr = UdmUniSegment(query, ustr, lang);

  lex = UdmUniGetSepToken(ustr, &lt , &ctype);
  for ( ;lex; lex= UdmUniGetSepToken(NULL, &lt, &ctype))
  {
    size_t clen; /* token length clamped to the word buffer size */

    wlen=lt-lex;
    clen= udm_min(wlen, query->Conf->WordParam.max_word_len);
    memcpy(uwrd, lex, clen * sizeof(int));
    uwrd[clen] = 0;
    /*
      Convert exactly what was copied (plus the terminator):
      uwrd holds clen + 1 items, not wlen + 1.
    */
    UdmConv(&uni_lc, wrd, query->Conf->WordParam.max_word_len * 12,(char*)uwrd, sizeof(uwrd[0])*(clen+1));
    UdmTrim(wrd, " \t\r\n");
    
    if (ctype == UDM_UNI_SEPAR)
    {
      UdmStackItemAddCmd(Res, &state, lex, wlen);
    } 
    else
    {
      UDM_VAR *Section;

      /* "section:" prefix selects the section of the following words */
      if (lt[0] == ':')
      {
        if ((Section= UdmVarListFind(&query->Conf->Sections, wrd)))
        {
          state.secno= Section->section;
          continue;
        }
        if (wlen > 5 && !strncmp(wrd, "secno", 5))
        {
          state.secno= atoi(wrd + 5);
          continue;
        }
      }
      
      UdmStackItemAddWord(query, Res, &state, lex, wlen, lt, uwrd, wrd, &uni_lc);
    }
  }
  
  /* Close a phrase that was left open */
  if (state.nphrasecmd & 1)
  {
    UDM_STACK_ITEM item;
    item.cmd= UDM_STACK_PHRASE;
    item.arg= 0;
    UdmStackItemAdd(Res, &item);
    Res->ncmds++;
  }
  Res->WWList.wm= state.word_match;
  rc= UDM_OK;

ret:
  if (ltxt)
  {
    UDM_FREE(ltxt);
  }
  if (ustr)
  {
    UDM_FREE(ustr);
  }
  if (uwrd)
  {
    UDM_FREE(uwrd);
  }
  if (wrd)
  {
    UDM_FREE(wrd);
  }
  return rc;
}


/*
  Parameters shared by the cosine-based scoring functions.
*/
typedef struct udm_score_param_st
{
  unsigned int *R;     /* vector multiplied element-wise with D in the
                          cosine functions                               */
  unsigned int *D;     /* vector whose squares are summed for the norm;
                          UdmCalcCosineWeightQuick() reuses this memory
                          as a byte-per-slot "already added" map         */
  size_t ncosine;      /* number of vector elements processed            */
  size_t nsections;    /* sections per word: slot = nsections*word + sec */
  float Rsum_factor;   /* constant multiplier applied to the result      */
} UDM_SCORE_PARAM;



/*
  Cosine-like weight of vector D against vector R:

    Rsum_factor * nwords_factor * sum(R[i]*D[i]) / sqrt(sum(D[i]^2))

  rounded to the nearest integer. The sums run over the
  score_param->ncosine elements of the vectors.

  R[i] and D[i] are in the range 0..64.
  ns is between 1..256

  Returns 0 when D has no non-zero element (the norm would be zero).
*/
static
inline uint4 UdmCalcCosineWeight(UDM_SCORE_PARAM *score_param,
                                 float nwords_factor)
{
  size_t i, num= score_param->ncosine;
  unsigned int *D= score_param->D;
  unsigned int *R= score_param->R;
  float res;
  size_t Dsum= 0;
  size_t RDsum= 0;

  for (i= 0; i < num; i++)
  {
    /* Zero elements contribute nothing to either sum */
    if (D[i])
    {
      Dsum+=  D[i] * D[i];
      RDsum+= R[i] * D[i];
    }
  }

  /* Avoid 0/0: converting the resulting NaN to an integer is undefined */
  if (!Dsum)
    return 0;
  
  res= score_param->Rsum_factor * nwords_factor *
       (float) RDsum / sqrt(Dsum) + 0.5;
  
  return (uint4) res;
}


/* Forward declaration: used by UdmCalcCosineWeightQuick(), defined below */
static unsigned int
CalcAverageWordDistance(UDM_WIDEWORDLIST *WWL,
                        UDM_URL_CRD *phr, size_t num,
                        unsigned int dst_weight);

/*
  Cosine-like weight of one document computed straight from its
  coordinates, without filling the D vector first.

  The word-distance component from CalcAverageWordDistance() is paired
  with R[dst_offs]. Then every (word, section) slot that occurs in
  Crd[0..ncoords) contributes wf2[section] + word weight exactly once;
  the memory of score_param->D is reused as a byte map of the slots
  already counted. Coordinates whose section is not below
  score_param->nsections are skipped.

  The result is
    Rsum_factor * nwords_factor * sum(R*D) / sqrt(sum(D^2))
  rounded to the nearest integer, or 0 when all contributions are zero.

  nwf_num is not used.
*/
static uint4
UdmCalcCosineWeightQuick(UDM_SCORE_PARAM *score_param,
                         UDM_WIDEWORDLIST *WWL, 
                         UDM_URL_CRD *Crd, size_t ncoords,
                         int *wf2, size_t nwf_num,
                         size_t dst_offs,
                         unsigned int dst_weight,
                         float nwords_factor)
{
  float res;
  size_t Dsum= 0;
  size_t RDsum= 0;
  size_t add;
  char *added= (char*) score_param->D;  /* one flag byte per slot */
  
  bzero(added, score_param->ncosine);

  /* Word distance component */
  add= CalcAverageWordDistance(WWL, Crd, ncoords, dst_weight);
  Dsum= add * add;
  RDsum= score_param->R[dst_offs] * add;
  
  for( ; ncoords; Crd++, ncoords--)
  {
    uint4 coord= Crd->coord;
    size_t wrdsec= UDM_WRDSEC(coord);
    size_t wrdnum= WWL->Word[UDM_WRDNUM(coord)].order;
    UDM_WIDEWORD *W= &WWL->Word[wrdnum];
    if (wrdsec < score_param->nsections)
    {
      size_t offs= score_param->nsections * wrdnum + wrdsec;
      if (!added[offs])
      {
        add= wf2[wrdsec] + W->weight;
        Dsum+= add*add;
        RDsum+= add * score_param->R[offs];
        added[offs]= 1;
      }
    }
  }

  /* Avoid 0/0: converting the resulting NaN to an integer is undefined */
  if (!Dsum)
    return 0;

  res= score_param->Rsum_factor * nwords_factor *
        (float) RDsum / sqrt(Dsum) + 0.5;

  return (uint4) res;
}



/*
  Weight of a word by its origin:
  3 for words typed in the query, 1 for spell forms, 0 for anything else.
*/
static int UdmOriginWeight(int origin)
{
  if (origin == UDM_WORD_ORIGIN_QUERY)
    return 3;
  if (origin == UDM_WORD_ORIGIN_SPELL)
    return 1;
  return 0;
}


/*
  Assign to every word of the list the weight
  that UdmOriginWeight() returns for its origin.
*/
static void UdmWideWordListSetOriginWeight(UDM_WIDEWORDLIST *WWList)
{
  size_t idx= 0;
  while (idx < WWList->nwords)
  {
    UDM_WIDEWORD *W= &WWList->Word[idx++];
    W->weight= UdmOriginWeight(W->origin);
  }
}


/*
  Position/order pair describing one word occurrence.
  CalcAverageWordDistance() keeps a sliding window of three of these.
*/
typedef struct udm_coord_st
{
  uint4 pos;  /* word position, taken from UDM_WRDPOS(coord)         */
  uint4 ord;  /* "order" of the query word this occurrence belongs to */
} UDM_COORD;


/*
  Average distance between neighbouring occurrences of different
  query words, scaled by dst_weight/255.

  phr, num   - coordinates of one document
  dst_weight - scale factor, 255 means "no scaling"

  Fewer than two coordinates give 0. For exactly two coordinates the
  result is the scaled gap between them (adjacent words give 0), or 0
  when both belong to the same query word. Otherwise a window of three
  consecutive occurrences slides over the list and only windows whose
  middle word differs from both neighbours contribute.
*/
static unsigned int
CalcAverageWordDistance(UDM_WIDEWORDLIST *WWL,
                        UDM_URL_CRD *phr, size_t num,
                        unsigned int dst_weight)
{
  size_t total= 0, npairs= 0;
  UDM_COORD left, mid, right;
  UDM_URL_CRD *stop;

  if (num < 2)
    return 0;

  if (num == 2)
  {
    size_t p0= UDM_WRDPOS(phr[0].coord);
    size_t p1= UDM_WRDPOS(phr[1].coord);
    size_t o0= WWL->Word[UDM_WRDNUM(phr[0].coord)].order;
    size_t o1= WWL->Word[UDM_WRDNUM(phr[1].coord)].order;
    size_t gap;

    if (o0 == o1)
      return 0;
    gap= (p1 > p0) ? p1 - p0 : p0 - p1;
    if (gap == 0)
      return 0;
    return (gap - 1) * dst_weight / 255;
  }

  stop= phr + num;

  /* Prime the window with the first two occurrences */
  left.pos= UDM_WRDPOS(phr[0].coord);
  left.ord= WWL->Word[UDM_WRDNUM(phr[0].coord)].order;
  mid.pos= UDM_WRDPOS(phr[1].coord);
  mid.ord= WWL->Word[UDM_WRDNUM(phr[1].coord)].order;

  for (phr+= 2; phr < stop; phr++)
  {
    right.pos= UDM_WRDPOS(phr->coord);
    right.ord= WWL->Word[UDM_WRDNUM(phr->coord)].order;

    /*
      Windows like "w1 w1 w1", "w1 w1 w2" and "w1 w2 w2"
      are ignored: the middle word must differ from both sides.
    */
    if (left.ord != mid.ord && mid.ord != right.ord)
    {
      uint4 gap_lm= mid.pos - left.pos;
      uint4 gap_mr= right.pos - mid.pos;

      /* Adjacent words have distance 0 */
      if (gap_lm > 0)
        gap_lm--;
      if (gap_mr > 0)
        gap_mr--;

      if (left.ord == right.ord)
      {
        /* w1 w2 w1: count the nearer neighbour only */
        total+= (gap_lm < gap_mr) ? gap_lm : gap_mr;
        npairs++;
      }
      else
      {
        /* w1 w2 w3: count both gaps */
        total+= gap_lm + gap_mr;
        npairs+= 2;
      }
    }

    left= mid;
    mid= right;
  }

  return npairs ? (total * dst_weight / npairs / 255) : 0;
}

#if 0
/*
  Debugging helper (compiled out): prints to stderr, one line per
  coordinate, the element address, its index, url_id and coord.
  NOTE(review): "%d" is used for a size_t index; adjust the format
  before enabling this code.
*/
static void
UdmPrintCoords(UDM_URL_CRD *Coords, size_t ncoords)
{
  size_t i;
  for (i=0 ; i < ncoords; i++)
    fprintf(stderr, "%p %d %d %d\n", Coords+i, i,
                    Coords[i].url_id, Coords[i].coord);
}
#endif

/*
  For every phrase of the query (items between two UDM_STACK_PHRASE
  commands) check whether the document contains the phrase words at
  consecutive positions, and store the outcome (1 or 0) into count[]
  for each UDM_STACK_WORD item of the phrase, indexed by the item's arg.

  WWL            - query word list, Word[].order is compared with item args
  query, nitems  - query stack
  coords, ncoords- coordinates of one document
  count          - per-word counters, overwritten for phrase words

  Stopwords inside a phrase are not matched against coordinates; they
  only widen the expected position gap between the surrounding words.
*/
static inline void CheckPhrase(
       UDM_WIDEWORDLIST *WWL,
       UDM_STACK_ITEM *query, size_t nitems,
       UDM_URL_CRD *coords, size_t ncoords,
       size_t *count)
{
  size_t q;
  size_t start, end, arg, i;
  size_t rstart, rend;
  size_t j, d, delta;
  UDM_URL_CRD *prev;

  /* find opening phrase command */
  for (q= 0; q < nitems; q++)
  {
    if (query[q].cmd != UDM_STACK_PHRASE) continue;

    /* find closing phrase command */
    start= q + 1;
    for (end= start; end < nitems && query[end].cmd != UDM_STACK_PHRASE; end++);
    q= end;
    arg= 0;

    /* skip leading and trailing stopwords for now */
    /* TODO: we have to check document length (for phrases like "word1 stopword1") */
    for (rstart= start; rstart < end && query[rstart].cmd == UDM_STACK_STOP; rstart++);
    /*
      rend is an exclusive bound, so the last phrase item is query[rend - 1].
      Testing query[rend] would look at the closing phrase command, or
      beyond the last item when the phrase is not terminated.
    */
    for (rend= end; rend > rstart && query[rend - 1].cmd == UDM_STACK_STOP; rend--);

    /* if phrase contains stopwords only, we assume this document is found */
    if (rstart == rend) arg= 1;
    else for (i= 0; i < ncoords; i++)
    {
      size_t ord_i= WWL->Word[UDM_WRDNUM(coords[i].coord)].order;
      /* every occurrence of the first phrase word is a candidate start */
      if (query[rstart].arg != ord_i) continue;
      prev = &coords[i];
      d= i + 1;
      delta= 1;

      for (j= rstart + 1; j < rend; j++)
      {
        if (query[j].cmd == UDM_STACK_STOP)
        {
          /* a stopword occupies one position in the document */
          delta++;
          continue;
        }

        /* find coord for this word */
        while (d < ncoords &&
               (UDM_WRDPOS(prev->coord) == UDM_WRDPOS(coords[d].coord) ||
                (UDM_WRDPOS(prev->coord) + delta == UDM_WRDPOS(coords[d].coord) &&
                 WWL->Word[UDM_WRDNUM(coords[d].coord)].order != query[j].arg))) d++;

        if (d == ncoords ||
            UDM_WRDPOS(prev->coord) != UDM_WRDPOS(coords[d].coord) - delta ||
            query[j].arg != WWL->Word[UDM_WRDNUM(coords[d].coord)].order) break;

        delta= 1;
        prev= &coords[d];
      }
      if (j == rend)
      {
        /* all phrase words were found in sequence */
        arg= 1;
        break;
      }
    }

    for (i= rstart; i < rend; i++)
      if (query[i].cmd == UDM_STACK_WORD) count[query[i].arg]= arg;
  }
}

/*
#define DEBUG_REL 1
*/

/*
  Experimental relevance calculation.

  Groups Res->CoordList by url_id in place: consecutive coordinates with
  the same url_id form one document. For every query word number the
  function collects the total number of hits (TFi), the average number of
  hits per document (Ci), the variance of per-document hits (Vi) and a
  word weight Wi= TFi*Vi/Ci^2. A document score is the cosine between
  its per-word hit vector Di and Wi, multiplied by 100000 and stored in
  the coord field. One extra "fictitious" word with the same value for
  every document is appended to the vectors.

  On return CoordList.ncoords is the number of documents.

  NOTE(review): with a single document the variance is divided by
  (N-1) == 0; confirm callers never rely on the result in that case.
*/
static void UdmGroupByURLNewRel(UDM_RESULT *Res)
{
  urlid_t prev_id;
  size_t N= 0;
  size_t M= Res->WWList.nuniq;
  size_t ndoc= 0; 
  size_t i;
  UDM_URL_CRD *Crd = Res->CoordList.Coords;
  float  TFi[UDM_MAXWORDPERQUERY+1];
  float   Ci[UDM_MAXWORDPERQUERY+1];
  float   Vi[UDM_MAXWORDPERQUERY+1];    
  float   Wi[UDM_MAXWORDPERQUERY+1];
  float   Di[UDM_MAXWORDPERQUERY+1];

  /* Nothing to group; also keeps Crd[0] below from being read */
  if (!Res->CoordList.ncoords)
    return;
    
  bzero(TFi, sizeof(TFi));
  bzero(Ci, sizeof(Ci));
  bzero(Vi, sizeof(Vi));
  bzero(Di, sizeof(Di));
  
  
  /* N - number of documents */
  for (prev_id= Crd[0].url_id, i=0; i<= Res->CoordList.ncoords; i++)
  {
    if ( (i==Res->CoordList.ncoords) || (prev_id != Crd[i].url_id))
    {
      N++;
      if (i < Res->CoordList.ncoords)
      {
        prev_id= Crd[i].url_id;
      }
    }

#ifdef DEBUG_REL
    if (i < Res->CoordList.ncoords)
    {
      fprintf(stderr,"[%d]doc=%d sec=%d wrd=%d pos=%d\n",
              i,Crd[i].url_id,
              UDM_WRDSEC(Crd[i].coord),
              UDM_WRDNUM(Crd[i].coord),
              UDM_WRDPOS(Crd[i].coord));
    }
#endif
  }
    
    
  /* TFi - total number of hits per word */
  for (i=0; i < Res->CoordList.ncoords; i++)
  {
    TFi[UDM_WRDNUM(Crd[i].coord)]++;
  }
    
  /* Add fictitious word */
  TFi[M]= ((float) Res->CoordList.ncoords) / M;
  M++;
    
    
    
  /* Ci - average number of hits per document */
  for (i=0; i < M; i++)
  {
     Ci[i]= ((float) TFi[i]) / N;
#ifdef DEBUG_REL
     fprintf(stderr,"[%d] TDi=%d Ci=%f\n",i,TFi[i], Ci[i]);
#endif
  }
    
    
  /* Vi - sum of squared deviations of per-document hits from Ci */
  for (prev_id= Crd[0].url_id, i=0; i<= Res->CoordList.ncoords; i++)
  {
    size_t k;
      
    if ( (i==Res->CoordList.ncoords) || (prev_id != Crd[i].url_id))
    {
      Di[M-1]= TFi[M-1]/N; /* fictitious word */
      for (k=0; k < M; k++)
      {
        Vi[k]+= (Di[k] - Ci[k]) * (Di[k] - Ci[k]);
      }
      bzero(Di, sizeof(Di));
        
      if (i < Res->CoordList.ncoords)
      {
        prev_id= Crd[i].url_id;
      }
    }
    if (i < Res->CoordList.ncoords)
      Di[UDM_WRDNUM(Crd[i].coord)]++;
  }
    
    
  /* Wi - word weights */
  for (i=0; i < M; i++)
  {
    Vi[i]/= (N-1);
    Wi[i]= Ci[i] ? TFi[i]*Vi[i]/Ci[i]/Ci[i] : 0;
#ifdef DEBUG_REL
    fprintf(stderr,"[%d] Vi=%f Wi=%f\n", i, Vi[i],Wi[i]);
#endif
  }
    
    
  /* Final pass: one score per document, written over the list head */
  bzero(Di,sizeof(Di));
  for (prev_id= Crd[0].url_id, i=0; i<= Res->CoordList.ncoords; i++)
  {
    size_t k;
    
    if ( (i==Res->CoordList.ncoords) || (prev_id != Crd[i].url_id))
    {
      float sumWiDi= 0;
      float sumDi2= 0;
      float sumWi2= 0;
      float res;
        
      Di[M-1]= TFi[M-1]/N;  /* fictitious word */
       
      for (k=0; k < M; k++)
      {
        sumWiDi+= Wi[k] * Di[k] / (float)M;
        sumDi2+=  Di[k] * Di[k] / (float)M;
        sumWi2+=  Wi[k] * Wi[k];
      }
        
      res= sumWiDi / sqrt(sumDi2 * sumWi2);
      /* ndoc never exceeds the index of the first unread coordinate */
      Crd[ndoc].url_id= prev_id;
      Crd[ndoc].coord= res * 100000;
        
#ifdef DEBUG_REL
      fprintf(stderr,"RES[%d]=%f %f %f\n",
        prev_id, res, sumWiDi, sqrt(sumDi2 * sumWi2));
#endif 
        
      ndoc++;
        
      bzero(Di, sizeof(Di));
      if (i < Res->CoordList.ncoords)
      {
        prev_id= Crd[i].url_id;
      }
    }
      
    /* i == ncoords is the flush iteration: there is no Crd[i] to count */
    if (i < Res->CoordList.ncoords)
      Di[UDM_WRDNUM(Crd[i].coord)]++;
  }
    
  Res->CoordList.ncoords= ndoc;
  return;
  
}


/* Number of precalculated entries in a "number of words" factor table */
#define NUMWORD_FACTOR_SIZE 256
/* Table lookup; indexes beyond the table use its last entry */
#define NUMWORD_FACTOR(nwf, n) ((nwf)[(n) >= NUMWORD_FACTOR_SIZE ? NUMWORD_FACTOR_SIZE - 1 : (n)])
/* 1 - factor * max_coord, calculated in float */
#define MAXCOORD_FACTOR(factor, max_coord) (1 - ((float) (factor) * (float) (max_coord)))
/* 0x1000 / (0x1000 + factor * (coord - 1)), calculated in float */
#define MINCOORD_FACTOR(factor, coord) ((float) 0x1000 / (float) (0x1000 + (factor) * ((coord) - 1)))

/*
  Fill the NUMWORD_FACTOR_SIZE entries of numword_factor:
    numword_factor[n]= (1-k) + (n/ncoords)*k
  with k limited to the range 0..1 first.
*/
static
void UdmNumWordFactorInit(float k, float *numword_factor, size_t ncoords)
{
  float *slot= numword_factor;
  size_t n= 0;

  if (k > 1)
    k= 1;
  else if (k < 0)
    k= 0;

  while (n < NUMWORD_FACTOR_SIZE)
  {
    *slot++= (1-k) + ((float) n / ncoords)*k;
    n++;
  }
}


/*
  Turn the "section was seen" flags in D[0..nwf_num-1] into weights.
  When at most one flag is set, the flagged entry becomes nwf[i]*16,
  so the result weight is affected only when a section is the only one.
  All other entries are reset to 0.
*/
static
void UdmNWFNormalize(unsigned int *D, int *nwf, size_t nwf_num)
{
  size_t idx;
  size_t nonzero= 0;
  int lone;

  /* Find how many sections we have */
  for (idx= 0; idx < nwf_num; idx++)
    nonzero+= (D[idx] != 0);

  lone= (nonzero <= 1);

  for (idx= 0; idx < nwf_num; idx++)
  {
    if (lone && D[idx])
      D[idx]= nwf[idx] * 16;
    else
      D[idx]= 0;
  }
}


/*
  Fill the document vector D from the coordinates of one document.

  *D_dst_offs receives the average word distance. For every coordinate
  whose section number is below nsections, the component
  D[nsections * word_order + section] is set to
  wf2[section] + weight of the word, and, when nwf_num is not zero,
  D_nwf_offs[section] is flagged with 1.
  The caller is expected to have cleared D beforehand.
*/
static void FillD(unsigned int *D,
                  UDM_WIDEWORDLIST *WWL, 
                  UDM_URL_CRD *Crd, size_t ncoords,
                  size_t nsections,
                  int *wf2, size_t nwf_num,
                  unsigned int *D_nwf_offs,
                  unsigned int *D_dst_offs,
                  unsigned int dst_weight)
{
  size_t k;

  *D_dst_offs= CalcAverageWordDistance(WWL, Crd, ncoords, dst_weight);

  for (k= 0; k < ncoords; k++)
  {
    uint4 packed= Crd[k].coord;
    size_t section= UDM_WRDSEC(packed);
    size_t order= WWL->Word[UDM_WRDNUM(packed)].order;
    UDM_WIDEWORD *Base= &WWL->Word[order];

    if (section >= nsections)
      continue;

    D[nsections * order + section]= wf2[section] + Base->weight;
    if (nwf_num)
      D_nwf_offs[section]= 1;
  }
}


/*
  Collapse the coordinate list of a result into one entry per document.

  Consecutive coordinates sharing the same url_id are treated as one
  document. Documents that do not satisfy search_mode (boolean
  expression, or "all words present") are dropped. Every remaining
  document is written back to the head of Res->CoordList.Coords with
  its calculated weight stored in the coord field, and
  Res->CoordList.ncoords is set to the number of documents kept.

  query        - agent; Conf->Vars supplies NumSections, NumWordFactor,
                 MaxCoordFactor, MinCoordFactor, WordDistanceWeight,
                 NumDistinctWordFactor, "wf" and "nwf"
  Res          - result to group, modified in place
  search_mode  - one of the UDM_MODE_xxx values

  If a working buffer cannot be allocated the function frees what it
  got and returns, leaving the coordinate list as it was.

  NOTE(review): wf, wf2 and nwf have 256 entries while nsections comes
  from configuration - presumably NumSections never exceeds 256; confirm.
  NOTE(review): numdistinctword_factor has NUMWORD_FACTOR_SIZE entries
  and is filled for Res->WWList.nuniq indexes - presumably nuniq is
  always smaller; confirm.
*/
static void
UdmGroupByURLInternal(UDM_AGENT *query,UDM_RESULT *Res, int search_mode)
{
  UDM_STACK_ITEM *temp_items;
  size_t  i, D_size, nitems = Res->nitems;
  size_t  *count, count_size = Res->WWList.nuniq * sizeof(size_t);
  size_t nsections = UdmVarListFindInt(&query->Conf->Vars, "NumSections", 256);
  /* Layout of R and D: nuniq*nsections word/section slots, one distance slot, then nwf slots */
  size_t dst_offs= Res->WWList.nuniq * nsections;
  size_t nwf_offs= Res->WWList.nuniq * nsections + 1;
  size_t nwf_num= 0;  /* How many words in this section */
  size_t numcosine;
  int wf[256], wf2[256], nwf[256];
  unsigned int *R, *D, *D_dst_offs, *D_nwf_offs;
  size_t Rsum;
  UDM_URL_CRD *Crd= Res->CoordList.Coords;
  UDM_URL_CRD *CrdTo= Res->CoordList.Coords;
  UDM_URL_CRD *CrdFrom, *CrdCurr;
  UDM_URL_CRD *CrdLast= Res->CoordList.Coords + Res->CoordList.ncoords;
  UDM_WIDEWORD *Res_WWList_Word= Res->WWList.Word;
  UDM_SCORE_PARAM score_param;
  float Rsum_factor;
  float numword_factor[NUMWORD_FACTOR_SIZE];
  float numwordfactor= ((float)UdmVarListFindDouble(&query->Conf->Vars,
                                                "NumWordFactor", 25.5)) / 255;
  float max_coord_factor= ((float)UdmVarListFindInt(&query->Conf->Vars,
                                             "MaxCoordFactor", 255)) / 0xFFFFFF;
  unsigned int dst_weight= (unsigned int) UdmVarListFindInt(&query->Conf->Vars,
                                                 "WordDistanceWeight", 255);

  unsigned int  MinCoordFactor= UdmVarListFindInt(&query->Conf->Vars,
                                                  "MinCoordFactor", 0);
  unsigned int numdistinctwordfactor= (unsigned int) UdmVarListFindInt(&query->Conf->Vars,
                                                     "NumDistinctWordFactor", 0);
  float numdistinctword_factor[NUMWORD_FACTOR_SIZE];
  if(!Res->CoordList.ncoords) return;

  /* The experimental relevance code is compiled but never executed */
  if (0)
  {
    UdmGroupByURLNewRel(Res);
    return;
  }
  
  /* Factor applied when only i of the nuniq distinct query words were found */
  for (i= 0; i < Res->WWList.nuniq; i++)
  {
    unsigned int k= numdistinctwordfactor;
    float x= (float) i / Res->WWList.nuniq;
    numdistinctword_factor[i]= ((x * x * x * x * k) + (256-k)) / 256;
  }
  
  UdmWideWordListSetOriginWeight(&Res->WWList);
  UdmNumWordFactorInit(numwordfactor, numword_factor, Res->CoordList.ncoords);
  UdmWeightFactorsInit(UdmVarListFindStr(&query->Conf->Vars, "wf", ""), wf);
  for (i= 0; i < 256; i++)
    wf2[i]= wf[i] << 2;
  /* The nwf part of the vectors exists only when "nwf" is configured */
  if (UdmVarListFindStr(&query->Conf->Vars, "nwf", NULL))
  {
    UdmWeightFactorsInit(UdmVarListFindStr(&query->Conf->Vars, "nwf", ""), nwf);
    nwf_num= nsections;
  }  

  numcosine= Res->WWList.nuniq * nsections + 1 + nwf_num;
  
  D_size= numcosine * sizeof(unsigned int);
  
  count= (size_t*)UdmMalloc(count_size);
  R= (unsigned int*)UdmMalloc(D_size);
  D= (unsigned int*)UdmMalloc(D_size);
  temp_items = (UDM_STACK_ITEM*)UdmMalloc((Res->nitems + 1) * sizeof(UDM_STACK_ITEM) * 2);
  if (!count || !R || !D || !temp_items)
    goto err;

  bzero((void*) R, D_size);
  D_dst_offs= D + dst_offs;
  D_nwf_offs= D + nwf_offs;
  
  /* R is the reference vector: only words typed in the query contribute, in every section */
  for(Rsum=0, i= 0; i < Res->WWList.nwords; i++)
  {
    size_t secno;
    if (Res_WWList_Word[i].origin != UDM_WORD_ORIGIN_QUERY)
      continue;
    for (secno= 0; secno < nsections; secno++)
    {
      size_t offs= Res_WWList_Word[i].order * nsections + secno;
      R[offs] = wf2[secno] + Res_WWList_Word[i].weight;
      Rsum+= R[offs] * R[offs];
    }
  }
  

  /* R[dst_offs] is not assigned above, so after the bzero this adds 0 */
  Rsum+= R[dst_offs] * R[dst_offs];
  Rsum_factor= 100000 / sqrt(Rsum);
  
  
  /* Build an explicit boolean expression in temp_items and switch to boolean mode */
  if (Res->ncmds > 0 || search_mode == UDM_MODE_BOOL)
  {
    size_t j;
    int inphrase, add_cmd= UDM_STACK_AND;
    
    switch(search_mode)
    {
      case UDM_MODE_ANY:
        add_cmd = UDM_STACK_OR;
        break;
      case UDM_MODE_BOOL:
      case UDM_MODE_ALL:
        add_cmd = UDM_STACK_AND;
        break;
    }
    temp_items[0] = Res->items[0];
    inphrase = (Res->items[0].cmd == UDM_STACK_PHRASE) ? 1 : 0;

    for (i= 1, j= 1; i < Res->nitems; i++)
    {
      /*
       * If previous item is WORD or PHRASE or RIGHT or STOPWORD
       * and next item is WORD or PHRASE or LEFT or STOPWORD
       * and we are not in phrase
       * we have to insert search mode dependent operator.
       */
      if ((Res->items[i - 1].cmd == UDM_STACK_WORD ||
           Res->items[i - 1].cmd == UDM_STACK_STOP ||
           Res->items[i - 1].cmd == UDM_STACK_PHRASE ||
	   Res->items[i - 1].cmd == UDM_STACK_RIGHT) &&
          (Res->items[i].cmd == UDM_STACK_WORD ||
	   Res->items[i].cmd == UDM_STACK_STOP ||
           Res->items[i].cmd == UDM_STACK_PHRASE ||
	   Res->items[i].cmd == UDM_STACK_LEFT) &&
	  ! inphrase)
      {
        temp_items[j].cmd = add_cmd;
        temp_items[j].arg = 0;
	j++;
      }
      if (Res->items[i].cmd == UDM_STACK_PHRASE) inphrase = ! inphrase;
      temp_items[j++] = Res->items[i];
    }
    search_mode = UDM_MODE_BOOL;
    nitems = j;
  }

  score_param.R= R;
  score_param.D= D;
  score_param.ncosine= numcosine;
  score_param.nsections= nsections;
  score_param.Rsum_factor= Rsum_factor;

  /* One iteration per document; CrdCurr ends up at the first coord of the next one */
  for (CrdFrom= Crd ; CrdFrom < CrdLast ; CrdFrom= CrdCurr)
  {
    float nword_factor;
    size_t z, phr_n;
    size_t max_coord= CrdFrom->coord;
    size_t min_coord= CrdFrom->coord;
    bzero((void*)count, count_size);
    
    /* Count hits per query word and track the smallest/largest coord of the document */
    for (CrdCurr= CrdFrom ;
         CrdCurr < CrdLast && CrdCurr->url_id == CrdFrom->url_id;
         CrdCurr++)
    {
      count[Res_WWList_Word[UDM_WRDNUM(CrdCurr->coord)].order]++;
      if (max_coord < CrdCurr->coord)
        max_coord= CrdCurr->coord;
      if (min_coord > CrdCurr->coord)
        min_coord= CrdCurr->coord;
    }
    
    max_coord= UDM_WRDPOS(max_coord);
    min_coord= UDM_WRDPOS(min_coord);
    phr_n= CrdCurr - CrdFrom;
    
    /* "continue" skips the document: nothing is written to CrdTo for it */
    switch(search_mode)
    {
      case UDM_MODE_BOOL:
        CheckPhrase(&Res->WWList, temp_items, nitems, CrdFrom, phr_n, count);
        if(!UdmCalcBoolItems(temp_items, nitems, count))
          continue;
        break;
        
      case UDM_MODE_ALL:
        if (phr_n < Res->WWList.nuniq)
          continue;
        for (z = 0; z < Res->WWList.nuniq; z++) if (count[z] == 0) break;
        if (z < Res->WWList.nuniq && count[z] == 0) continue;
    }
    nword_factor= NUMWORD_FACTOR(numword_factor, phr_n) *
                  MAXCOORD_FACTOR(max_coord_factor, max_coord) *
                  MINCOORD_FACTOR(MinCoordFactor, min_coord);

    /* Lower the factor for documents that miss some of the distinct query words */
    if (search_mode != UDM_MODE_ALL)
    {
      size_t nuniq= 0;
      for (z= 0; z < Res->WWList.nuniq; z++)
      {
        if (count[z])
          nuniq++;
      }
      if (nuniq < Res->WWList.nuniq)
        nword_factor= nword_factor * numdistinctword_factor[nuniq];
    }

    /* With nwf the full D vector is built; otherwise the quick variant is used */
    if (nwf_num)
    {
      bzero((void*)D, D_size);
      FillD(D, &Res->WWList, CrdFrom, phr_n,
            nsections, wf2, nwf_num, D_nwf_offs, D_dst_offs, dst_weight);
      UdmNWFNormalize(D_nwf_offs, nwf, nwf_num);
      CrdTo->coord= UdmCalcCosineWeight(&score_param, nword_factor);
    }
    else
    {
      CrdTo->coord= UdmCalcCosineWeightQuick(&score_param,
                                             &Res->WWList,
                                             CrdFrom, phr_n,
                                             wf2, nwf_num,
                                             dst_offs, dst_weight,
                                             nword_factor);
    }
    CrdTo->url_id= CrdFrom->url_id;
    CrdTo++;
  }
  
  Res->CoordList.ncoords= CrdTo - Crd;

err:

  UDM_FREE(temp_items);
  UDM_FREE(D);
  UDM_FREE(R);
  UDM_FREE(count);
  return;
}


/*
  Group the coordinate list of a result by URL using the search mode
  given in the "m" variable ("all" by default).

  When the mode is "all" and StrictModeThreshold is set, a copy of the
  original coordinates is kept. If strict grouping finds fewer documents
  than the threshold, the copy is restored and grouping is repeated in
  "any" mode. If that gives more documents, the number found in strict
  mode is stored in the "StrictModeFound" variable.

  If the copy cannot be allocated, only the strict grouping is done.
*/
void UdmGroupByURL(UDM_AGENT *query,UDM_RESULT *Res)
{
  int search_mode= UdmSearchMode(UdmVarListFindStr(&query->Conf->Vars, "m", "all"));
  UDM_URL_CRD *Coords= NULL;
  size_t threshold= UdmVarListFindInt(&query->Conf->Vars, "StrictModeThreshold", 0);
  size_t ncoords= (search_mode == UDM_MODE_ALL) && threshold ?
                  Res->CoordList.ncoords : 0;
  
  if (ncoords)
  {
    /* Grouping works in place, so keep the original for a possible retry */
    size_t nbytes= ncoords * sizeof(UDM_URL_CRD);
    if ((Coords= (UDM_URL_CRD*) UdmMalloc(nbytes)))
      memcpy(Coords, Res->CoordList.Coords, nbytes);
  }
  
  UdmGroupByURLInternal(query, Res, search_mode);
  
  if (ncoords && (Res->CoordList.ncoords < threshold) && Coords)
  {
    size_t nbytes= ncoords * sizeof(UDM_URL_CRD);
    size_t strict_mode_found= Res->CoordList.ncoords;
    memcpy(Res->CoordList.Coords, Coords, nbytes);
    Res->CoordList.ncoords= ncoords;
    /*
      Report the number of documents found in strict mode, not the
      restored number of raw coordinates. The values are converted
      to int to match the "%d" conversions.
    */
    UdmLog(query, UDM_LOG_DEBUG,
          "Too few results: %d, Threshold: %d, group in ANY mode", 
          (int) strict_mode_found, (int) threshold);
    UdmGroupByURLInternal(query, Res, UDM_MODE_ANY);
    if (Res->CoordList.ncoords > strict_mode_found)
      UdmVarListReplaceInt(&query->Conf->Vars, "StrictModeFound",
                           (int) strict_mode_found);
  }
  UDM_FREE(Coords);
}


/*
  Merge neighbouring entries of Res->CoordList.Data that share the same
  site_id into one entry, in place.

  The merged entry describes the best document of the run: the one with
  the highest coord, then the highest pop_rank, then the lowest url_id
  (an earlier entry wins only on a strictly lower url_id). Its per_site
  field is the sum of per_site over the whole run.
  CoordList.ncoords is set to the number of entries left.
*/
void UdmGroupBySite(UDM_AGENT *query, UDM_RESULT *Res)
{
  UDM_URLDATA *Data= Res->CoordList.Data;
  UDM_URLDATA *best, *cur, *stop;
  uint4 nper_site;

  if (!Res->CoordList.ncoords)
    return;

  best= Data;
  stop= Data + Res->CoordList.ncoords;
  nper_site= Data[0].per_site;

  for (cur= Data + 1; cur < stop; cur++)
  {
    int keep;

    if (best->site_id != cur->site_id)
    {
      /* Next site: close the current run and start a new one */
      best->per_site= nper_site;
      best++;
      *best= *cur;
      nper_site= cur->per_site;
      continue;
    }

    /* Same site: accumulate and decide which document represents it */
    nper_site+= cur->per_site;

    keep= (best->coord > cur->coord) ||
          (best->coord == cur->coord &&
           (best->pop_rank > cur->pop_rank ||
            (best->pop_rank == cur->pop_rank &&
             best->url_id < cur->url_id)));
    if (keep)
      continue;

    best->url_id=        cur->url_id;
    best->coord=         cur->coord;
    best->last_mod_time= cur->last_mod_time;
    best->pop_rank=      cur->pop_rank;
    best->url=           cur->url;
    best->section=       cur->section;
  }

  best->per_site= nper_site;
  Res->CoordList.ncoords= best - Data + 1;
  return;
}


/******** Convert category string into 32 bit number *************/

/*
  Decode a category string into a range of numbers.

  Only the first 12 characters of hex_str are used. They are split into
  two groups of 6 characters and every group is parsed as a base 36
  number.

  hi, lo   - lower bound: the string is padded with '0' up to 12
             characters and leading '0' characters are blanked
  fhi, flo - upper bound: the string is padded with 'Z' up to 12
             characters; filled only when both pointers are not NULL
*/
void UdmDecodeHex8Str(const char *hex_str, uint4 *hi,
                      uint4 *lo, uint4 *fhi, uint4 *flo)
{
  char digits[13], hi_part[7], lo_part[7];
  size_t used, k;

  /* How many characters of the source are taken, 12 at most */
  for (used= 0; used < 12 && hex_str[used]; used++);

  /* Lower bound */
  memcpy(digits, hex_str, used);
  memset(digits + used, '0', 12 - used);
  digits[12]= '\0';
  for (k= 0; digits[k] == '0'; k++)
    digits[k]= ' ';

  memcpy(hi_part, digits, 6);
  hi_part[6]= '\0';
  memcpy(lo_part, digits + 6, 6);
  lo_part[6]= '\0';

  *hi = (uint4)strtoul(hi_part, (char **)NULL, 36);
  *lo = (uint4)strtoul(lo_part, (char **)NULL, 36);

  if (fhi == NULL || flo == NULL)
    return;

  /* Upper bound */
  memcpy(digits, hex_str, used);
  memset(digits + used, 'Z', 12 - used);
  digits[12]= '\0';

  memcpy(hi_part, digits, 6);
  hi_part[6]= '\0';
  memcpy(lo_part, digits + 6, 6);
  lo_part[6]= '\0';

  *fhi = strtoul(hi_part, (char **)NULL, 36);
  *flo = strtoul(lo_part, (char **)NULL, 36);
}

/*
  Append a search limit to Agent->limits.

  type      - kind of the limit; selects how val is decoded:
              0 - val is decoded by UdmDecodeHex8Str() into a range
              1 - all bounds are 0
              2 - val is a decimal integer
              3 - val is hashed with UdmStrHash32()
              any other value stores zero bounds
  file_name - copied into the limit record
  val       - limit value, see above

  Returns 0 on success, 1 when there is no room for one more limit.

  NOTE(review): file_name is copied with strcpy(); the size of the
  destination field is not visible here - confirm callers pass short
  names only.
*/
int __UDMCALL UdmAddSearchLimit(UDM_AGENT *Agent, int type,
                                const char *file_name, const char *val)
{
  /* Zero defaults: an unknown type must not store indeterminate values */
  uint4 hi= 0, lo= 0, f_hi= 0, f_lo= 0;
  
  if(Agent->nlimits >= MAX_SEARCH_LIMIT - 1) return(1);
  
  Agent->limits[Agent->nlimits].type = type;
  strcpy(Agent->limits[Agent->nlimits].file_name, file_name);
  switch(type)
  {
    case 0: UdmDecodeHex8Str(val, &hi, &lo, &f_hi, &f_lo); break;
    case 1: f_hi = hi = 0; f_lo = lo = 0; break;
    case 2: hi=atoi(val); lo=0; f_hi = hi; f_lo = lo; break;
    case 3: hi=UdmStrHash32(val); lo = 0; f_hi = hi; f_lo = 0; break;
    default: break;
  }  
  Agent->limits[Agent->nlimits].hi = hi;
  Agent->limits[Agent->nlimits].lo = lo;
  Agent->limits[Agent->nlimits].f_hi = f_hi;
  Agent->limits[Agent->nlimits].f_lo = f_lo;
  
  Agent->nlimits++;
  
  UdmLog(Agent, UDM_LOG_DEBUG, "val: %s  %x %x   %x %x", val, hi, lo, f_hi, f_lo);
  
  return(0);
}

/*
  Parse a CGI query string ("name1=value1&name2=value2...").

  For every pair the unescaped value is added to vars twice: under its
  own name and under "query.<name>". If vars has a "Limit-<name>"
  variable whose value starts with a known limit type followed by ':'
  (category, tag, time, hostname, language, content, siteid) and the
  value of the pair is not empty, a search limit is added to Agent.

  Returns 0 on success, 1 when memory cannot be allocated.
*/
int UdmParseQueryString(UDM_AGENT * Agent,
                        UDM_VARLIST * vars,char * query_string)
{
  char * tok, *lt;
  size_t len;
  /* Room for the longest value, or for "Limit-" + the longest name, plus 0 */
  char *str = (char *)UdmMalloc((len = strlen(query_string)) + 7);
  char *qs = (char*)UdmStrdup(query_string);
  char qname[256];

  if ((str == NULL) || qs == NULL)
  {
    UDM_FREE(str);
    UDM_FREE(qs);
    return 1;
  }

  UdmSGMLUnescape(qs);
  
  tok = udm_strtok_r(qs, "&", &lt);
  while(tok)
  {
    char empty[]="";
    char * val;
    const char * lim;
    
    /* Split "name=value"; a name without '=' has an empty value */
    if((val=strchr(tok,'=')))
    {
      *val='\0';
      val++;
    }
    else
    {
      val=empty;
    }
    UdmUnescapeCGIQuery(str,val);
    UdmVarListAddQueryStr(vars,tok,str);
    udm_snprintf(qname, 256, "query.%s", tok);
    UdmVarListAddQueryStr(vars, qname, str);
    
    sprintf(str,"Limit-%s",tok);
    if((lim=UdmVarListFindStr(vars,str,NULL)))
    {
      int ltype=0;
      const char * type, * fname = NULL;
      char * llt;
      /*
        The limit definition can be longer than the query string:
        copy what fits and make sure the buffer is 0-terminated
        before it is tokenized.
      */
      udm_snprintf(str, len + 7, "%s", lim);
      str[len + 6]= '\0';
      
      if((type = udm_strtok_r(str, ":", &llt)))
      {
        if(!strcasecmp(type, "category"))
        {
          ltype = UDM_LIMTYPE_NESTED; fname = UDM_LIMFNAME_CAT;
        }
        else if(!strcasecmp(type, "tag"))
        {
          ltype = UDM_LIMTYPE_LINEAR_CRC; fname = UDM_LIMFNAME_TAG;
        }
        else if(!strcasecmp(type, "time"))
        {
          ltype = UDM_LIMTYPE_TIME; fname = UDM_LIMFNAME_TIME;
        }
        else if(!strcasecmp(type, "hostname"))
        {
          ltype = UDM_LIMTYPE_LINEAR_CRC; fname = UDM_LIMFNAME_HOST;
        }
        else if(!strcasecmp(type, "language"))
        {
          ltype = UDM_LIMTYPE_LINEAR_CRC; fname = UDM_LIMFNAME_LANG;
        }
        else if(!strcasecmp(type, "content"))
        {
          ltype = UDM_LIMTYPE_LINEAR_CRC; fname = UDM_LIMFNAME_CTYPE;
        }
        else if(!strcasecmp(type, "siteid"))
        {
          ltype = UDM_LIMTYPE_LINEAR_INT; fname = UDM_LIMFNAME_SITE;
        }
        /* Unknown limit types and empty values are ignored */
        if((fname != NULL) && val[0])
        {
          UdmAddSearchLimit(Agent,ltype,fname,val);
        }
      }
    }
    tok = udm_strtok_r(NULL, "&", &lt);
  }
  UDM_FREE(str);
  UDM_FREE(qs);
  return 0;
}



/*
  Find the query word that matches a document token.

  List   - query words; List->wm selects the match mode
  tok    - token, flen characters long
  hlstop - when 0, stopwords of the list are never matched
  phrpos - only words with this phrase position are considered

  A token of the same length as a word must be equal to it, ignoring
  case. A longer token matches when the word is its prefix
  (UDM_MATCH_BEGIN), its suffix (UDM_MATCH_END) or occurs anywhere
  inside it (UDM_MATCH_SUBSTR). Returns the first matching word,
  or NULL if there is none.
*/
static UDM_WIDEWORD*
UdmWordInWWList(UDM_WIDEWORDLIST *List, int *tok, size_t flen,
                int hlstop, size_t phrpos)
{
  size_t n;

  for (n= 0; n < List->nwords; n++)
  {
    UDM_WIDEWORD *Cand= &List->Word[n];
    size_t wlen;

    if (Cand->phrpos != phrpos ||
        (!hlstop && Cand->origin == UDM_WORD_ORIGIN_STOP))
      continue;

    wlen= Cand->ulen;
    if (flen < wlen)
      continue;

    if (flen == wlen)
    {
      if (!UdmUniStrNCaseCmp(tok, Cand->uword, wlen))
        return Cand;
      continue;
    }

    /* The token is longer than the word: the match mode decides */
    if (List->wm == UDM_MATCH_BEGIN)
    {
      if (!UdmUniStrNCaseCmp(tok, Cand->uword, wlen))
        return Cand;
    }
    else if (List->wm == UDM_MATCH_END)
    {
      if (!UdmUniStrNCaseCmp(tok + flen - wlen, Cand->uword, wlen))
        return Cand;
    }
    else if (List->wm == UDM_MATCH_SUBSTR)
    {
      size_t at, k;
      for (at= 0; at + wlen <= flen; at++)
      {
        for (k= 0; k < wlen; k++)
        {
          if (UdmUniToLower(tok[at + k]) != UdmUniToLower(Cand->uword[k]))
            break;
        }
        if (k == wlen)
          return Cand;
      }
    }
  }
  return NULL;
}


/*
  Strip highlight markers (codes 2 and 3) from the part of a buffer
  between byte offsets "from" and "to", moving the remaining data
  towards "from". Returns the new end of data as a byte offset from
  the start of the buffer.

  When cs is the internal charset the buffer is treated as an array
  of int, otherwise as an array of bytes.
*/
static size_t
UdmRemoveHl(UDM_CHARSET *cs, char *str, size_t from, size_t to)
{
  if (cs != &udm_charset_sys_int)
  {
    size_t rd, wr= from;
    for (rd= from; rd < to; rd++)
    {
      char ch= str[rd];
      if (ch == 2 || ch == 3)
        continue;
      str[wr++]= ch;
    }
    return wr;
  }
  else
  {
    int *wide= (int*) str;
    size_t rd= from / sizeof(int);
    size_t last= to / sizeof(int);
    size_t wr= rd;
    for ( ; rd < last; rd++)
    {
      int ch= wide[rd];
      if (ch == 2 || ch == 3)
        continue;
      wide[wr++]= ch;
    }
    return wr * sizeof(int);
  }
}


/*
  Append a token to the output buffer, converted with uni_bc.
  When "found" is not NULL the token is surrounded with the highlight
  markers (codes 2 and 3).

  dst, dstlen - output buffer and the number of bytes already used
  dstmaxlen   - total size of the output buffer
  tok, toklen - token as an array of toklen ints

  Returns the new number of used bytes.

  Every write is given the room that is really left after dstlen
  rather than the total buffer size.
*/
static size_t
UdmHlAppend(UDM_CONV *uni_bc, UDM_WIDEWORD *found,
            char *dst, size_t dstlen, size_t dstmaxlen,
            int *tok, size_t toklen)
{
  int i2= 2, i3= 3;
  size_t room;

  if (found)
  {
    room= dstmaxlen > dstlen ? dstmaxlen - dstlen : 0;
    dstlen+= UdmConv(uni_bc, dst + dstlen, room, (char*) &i2, sizeof(i2));
  }

  room= dstmaxlen > dstlen ? dstmaxlen - dstlen : 0;
  if (uni_bc->to == &udm_charset_sys_int)
  {
    /* No conversion is needed: copy as many whole characters as fit */
    size_t nbytes= sizeof(*tok) * toklen;
    if (nbytes > room)
      nbytes= room - room % sizeof(*tok);
    memcpy(dst + dstlen, tok, nbytes);
    dstlen+= nbytes;
  }
  else
    dstlen+= UdmConv(uni_bc, dst + dstlen, room, (char*) tok, sizeof(*tok) * toklen);

  if (found)
  {
    room= dstmaxlen > dstlen ? dstmaxlen - dstlen : 0;
    dstlen+= UdmConv(uni_bc, dst + dstlen, room, (char*) &i3, sizeof(i3));
  }

  /*fprintf(stderr, "appended to '%.*s'\n", dstlen, dst);*/

  return dstlen;
}

/*
#define DEBUG_HL 0
*/

/*
  Convert a string with the given converters and mark the words of the
  query with highlight markers (codes 2 and 3).

  List              - query words, NULL to convert without highlighting
  src               - source string
  lc_uni            - converter from the source charset to the internal one
  uni_bc            - converter from the internal charset to the output one
  hilight_stopwords - when 0, stopwords are not highlighted

  Words of a phrase are highlighted only when the whole phrase is found
  in sequence: markers of an incomplete phrase are removed again.

  Returns a newly allocated 0-terminated string that the caller has to
  free, or NULL when src is NULL or empty, or memory cannot be allocated.
*/

static char *
UdmHlConvertExtWithConv(UDM_WIDEWORDLIST *List, const char *src,
                        UDM_CONV *lc_uni, UDM_CONV *uni_bc,
                        int hilight_stopwords)
{
  int		*tok, *lt, ctype, *uni;
  int           i0= 0;
  char          *dst;
  size_t        srclen, dstlen= 0, dstlen_phr= 0;
  size_t        dstmaxlen, unimaxlen, expected_phrpos= 0;

#ifdef DEBUG_HL
  fprintf(stderr, "tocs=%s src='%s'\n", uni_bc->to->name, src);
#endif

  if(!src)return NULL;
  if ((srclen = strlen(src)) == 0) return NULL;
  
  dstmaxlen= srclen * 14 + 10;
  if (!(dst= (char*)UdmMalloc(dstmaxlen)))
    return NULL;

  /* Convert to unicode */
  unimaxlen= (srclen + 10) * sizeof(int);
  if (!(uni= (int *)UdmMalloc(unimaxlen)))
  {
    UdmFree(dst);
    return NULL;
  }
  UdmConv(lc_uni,(char*)uni, unimaxlen, src, srclen + 1);

  /* Parse unicode string */
  for (tok= UdmUniGetSepToken(uni, &lt, &ctype) ; tok ;
       tok= UdmUniGetSepToken(NULL, &lt, &ctype))
  {
    size_t toklen= lt - tok;

    if (ctype == UDM_UNI_SEPAR || !List)
    {
      /* Separators are copied as is and do not break a phrase */
      dstlen= UdmHlAppend(uni_bc, NULL, dst, dstlen, dstmaxlen, tok, toklen);
    }
    else
    {
      UDM_WIDEWORD *found;
      found= UdmWordInWWList(List, tok, toklen, hilight_stopwords, expected_phrpos);
      dstlen= UdmHlAppend(uni_bc, found, dst, dstlen, dstmaxlen, tok, toklen);
      if (found)
      {
        if (found->phrpos + 1 == found->phrlen)
        {
          /* last in phrase found */
          expected_phrpos= 0;
          dstlen_phr= dstlen;
        }
        else
        {
          /* middle in phrase found */
          expected_phrpos++;
        }
      }
      else
      {
        /* No word found on expected phrase position, rollback */
        dstlen= UdmRemoveHl(uni_bc->to, dst, dstlen_phr, dstlen);
        dstlen_phr= dstlen;
        expected_phrpos= 0;
      }
    }
  }


#ifdef DEBUG_HL  
  fprintf(stderr, "end: expected_phrpos=%d dstlen=%d dstlen_phr=%d\n", expected_phrpos, dstlen, dstlen_phr);
#endif
  if (expected_phrpos > 0)
  {
    /* Roll back: incomplete last phrase */
    dstlen= UdmRemoveHl(uni_bc->to, dst, dstlen_phr, dstlen);
  }

  /* Terminate the result, using only the room left in the buffer */
  dstlen+= UdmConv(uni_bc, dst + dstlen,
                   dstmaxlen > dstlen ? dstmaxlen - dstlen : 0,
                   (char*) &i0, sizeof(i0));

  UdmFree(uni);
  return dst;

}


/*
  Convert src from charset lcs to charset bcs, marking the words of
  List with highlight markers. hlstop tells whether stopwords are
  highlighted as well. Returns what UdmHlConvertExtWithConv() returns.
*/
char * UdmHlConvertExt(UDM_WIDEWORDLIST *List,const char * src,
                       UDM_CHARSET * lcs, UDM_CHARSET * bcs, int hlstop)
{
  UDM_CONV to_unicode;
  UDM_CONV from_unicode;
  char *marked;

  UdmConvInit(&to_unicode, lcs, &udm_charset_sys_int, UDM_RECODE_HTML);
  UdmConvInit(&from_unicode, &udm_charset_sys_int, bcs, UDM_RECODE_HTML);
  marked= UdmHlConvertExtWithConv(List, src, &to_unicode, &from_unicode,
                                  hlstop);
  return marked;
}


/*
  Kept for PHP module compatibility:
  same as UdmHlConvertExt() with stopword highlighting turned on.
*/
char * UdmHlConvert(UDM_WIDEWORDLIST *List,const char * src,
                    UDM_CHARSET * lcs, UDM_CHARSET * bcs)
{
  const int hilight_stopwords= 1;
  return UdmHlConvertExt(List, src, lcs, bcs, hilight_stopwords);
}


/*
  Convert a search result from charset lcs to charset bcs:
  - the words of Res->WWList,
  - the sections of every document, with highlight markers added
    (URL, CachedCopy and Content-Type sections are left untouched),
  - the string variables of Conf->Vars, except HlBeg and HlEnd.

  Whether stopwords are highlighted is taken from the "ExcerptStopword"
  variable (on by default).

  A word or variable without a value, or whose new buffer cannot be
  allocated, keeps its current value. Always returns UDM_OK.
*/
int UdmConvert(UDM_ENV *Conf, UDM_RESULT *Res,
               UDM_CHARSET *lcs, UDM_CHARSET *bcs)
{
  size_t i;
  UDM_CONV lc_bc, lc_uni, uni_bc;
  int hlstop= UdmVarListFindBool(&Conf->Vars, "ExcerptStopword", 1);

  /* Init converters */
  UdmConvInit(&lc_bc,lcs,bcs,UDM_RECODE_HTML);
  UdmConvInit(&lc_uni, lcs, &udm_charset_sys_int, UDM_RECODE_HTML);
  UdmConvInit(&uni_bc, &udm_charset_sys_int, bcs, UDM_RECODE_HTML);
  
  /* Convert word list */
  for(i=0;i<Res->WWList.nwords;i++)
  {
    UDM_WIDEWORD *W=&Res->WWList.Word[i];
    size_t len;
    char *newval;

    if (!W->word)
      continue;
    len= strlen(W->word);
    /* 12 output bytes per input byte, plus the terminator */
    if (!(newval= (char*)UdmMalloc(len * 12 + 1)))
      continue;
    
    UdmConv(&lc_bc,newval,len*12+1,W->word,len+1);
    UDM_FREE(W->word);
    W->word=newval;
  }
  
  /* Convert document sections */
  for(i=0;i<Res->num_rows;i++)
  {
    UDM_DOCUMENT  *D=&Res->Doc[i];
    size_t    sec;
    
    for(sec=0; sec < D->Sections.nvars; sec++)
    {
      UDM_VAR *Var= &D->Sections.Var[sec];
      
      /*
         A temporary fix to skip URL and CachedCopy:
         We will skip these sections for now.
         But this need a further fix in search.htm
         to distinguish three HTML formats:
         - HTML with &<>" escaped to &amp;&lt;&gt;&quot;
         - HTML with &<>" printed as is, no word hilight
         - HTML with &<>" printed as is, search word hilighted.
      */
      
      if (strcasecmp(Var->name,"URL") &&
          strcasecmp(Var->name,"CachedCopy") &&
          strcasecmp(Var->name,"Content-Type"))
      {
        /* The result is NULL for a missing or empty value */
        char *newval= UdmHlConvertExtWithConv(&Res->WWList, Var->val,
                                              &lc_uni, &uni_bc, hlstop);
        UDM_FREE(Var->val);
        Var->val= newval;
      }
    }
  }
  
  /* Convert Env->Vars */
  for (i = 0; i < Conf->Vars.nvars; i++)
  {
    UDM_VAR *Var= &Conf->Vars.Var[i];
    if (UdmVarType(Var) == UDM_VAR_STR &&
        Var->val &&
        strcasecmp(Var->name, "HlBeg") &&
        strcasecmp(Var->name, "HlEnd"))
    {
      size_t len= strlen(Var->val);
      char *newval= (char*)UdmMalloc(len * 12 + 1);

      if (!newval)
        continue;
    
      UdmConv(&lc_bc, newval, len * 12 + 1, Var->val, len + 1);
      UDM_FREE(Var->val);
      Var->val= newval;
    }
  }
  
  return UDM_OK;
}


/*
  Byte classification table used by UdmRemoveHiLightDup().
  A non-zero entry marks a byte value that gets special treatment there:
  0x00 (NUL), 0x02, 0x03 and 0x26 ('&').
  Only the leading part of the table is spelled out: array elements
  after the last explicit initializer are zero-initialized by the
  language, so every other byte value is "not special".
*/
static char rm_hl_special[256]=
{
  /* 0x00 - 0x0F: NUL, \2 and \3 are special */
  1,0,1,1, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0x10 - 0x1F: nothing special */
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0x20 - 0x26: the last entry, '&', is special */
  0,0,0,0, 0,0,1
};


/*
  Make a cleaned-up copy of a string.

  The copy is the input with these changes:
  - bytes \2 and \3 are dropped;
  - a decimal character reference "&#NNN;" (at least one digit) is
    replaced by a single character: the referenced character itself
    when its code is in the range 1..127, or '?' otherwise;
  - everything else, including a '&' that does not start a well-formed
    decimal reference, is copied as is.

  Every input byte produces at most one output byte, so a buffer of
  strlen(s)+1 bytes is always large enough.

  s       NUL-terminated input string; it is not modified.
  return  A newly allocated NUL-terminated string, to be released by
          the caller, or NULL if the allocation failed.
*/
char* UdmRemoveHiLightDup(const char *s)
{
  size_t len= strlen(s) + 1;
  char   *d, *res= (char*) UdmMalloc(len);

  if (!res)
    return NULL;

  for (d= res; *s; s++)
  {
    /* Fast path: the vast majority of bytes need no special handling */
    if (!rm_hl_special[(unsigned char) *s])
    {
      *d++= *s;
      continue;
    }

    switch (*s)
    {
      case '\2':
      case '\3':
        /* Dropped from the output */
        break;

      case '&':
        if (s[1] == '#')
        {
          const char *e;
          int code= 0;

          /*
            Accumulate the decimal code. Once it has left the 1..127
            range the exact value does not matter any more, so stop
            accumulating: this keeps "code" from overflowing on
            arbitrarily long digit runs.
          */
          for (e= s + 2; (*e >= '0') && (*e <= '9'); e++)
          {
            if (code < 128)
              code= code * 10 + (*e - '0');
          }

          /*
            Require at least one digit and the closing ';'.
            Code 0 is mapped to '?' as well: emitting it literally
            would put a NUL in the middle of the result.
          */
          if (e > s + 2 && *e == ';')
          {
            *d++= (code > 0 && code < 128) ? (char) code : '?';
            s= e;                /* the loop increment steps over ';' */
            break;
          }
        }
        /* Not a well-formed decimal reference: keep the '&' itself */
        *d++= *s;
        break;

      default:
        *d++= *s;
        break;
    }
  }
  *d= '\0';
  return res;
}



/*
  Write all items of a category list into a text buffer, one
  "<CAT id=... path=... link=... name=...>" line per item, with the
  attributes separated by tabs and every line ended by "\r\n".

  C        Category list to print.
  textbuf  Destination buffer.
  len      Size of the destination buffer in bytes.
  return   Always UDM_OK. Output that does not fit into the buffer
           is cut off.
*/
int UdmCatToTextBuf(UDM_CATEGORY *C, char *textbuf, size_t len)
{
  char    *end= textbuf;
  size_t  i;

  /* Without room even for the terminating '\0' nothing can be stored */
  if (!textbuf || !len)
    return UDM_OK;

  textbuf[0]= '\0';

  for (i= 0; i < C->ncategories; i++)
  {
    /*
      "end" always points at the terminating '\0' of what has been
      written so far, so the space left follows from pointer
      arithmetic; there is no need to rescan the buffer every time.
    */
    size_t room= len - (size_t) (end - textbuf);

    if (room <= 1)
      break;                     /* Only the '\0' fits: buffer is full */

    udm_snprintf(end, room,
                 "<CAT\tid=\"%d\"\tpath=\"%s\"\tlink=\"%s\"\tname=\"%s\">\r\n",
                 C->Category[i].rec_id, C->Category[i].path,
                 C->Category[i].link, C->Category[i].name);
    end+= strlen(end);
  }
  return UDM_OK;
}

/*
  Copy "src" into the fixed-size character array "dst" of "dstsize"
  bytes, cutting it off if necessary and always leaving the result
  NUL-terminated.
*/
static void UdmCatCopyField(char *dst, size_t dstsize, const char *src)
{
  if (!dstsize)
    return;
  strncpy(dst, src, dstsize - 1);
  dst[dstsize - 1]= '\0';
}


/*
  Parse one category tag from a text buffer and append the resulting
  item to a category list.

  Only the first HTML token of "textbuf" is examined. If it is a tag,
  a new zero-filled item is appended to C, and the tag attributes
  "id", "path", "link" and "name" are stored into the corresponding
  item members. Other attributes are ignored. Attribute names are
  compared case sensitively.

  C        Category list to append to.
  textbuf  Text to parse; NULL is accepted and ignored.
  return   Always UDM_OK, also when nothing was appended.
*/
int UdmCatFromTextBuf(UDM_CATEGORY *C, char *textbuf)
{
  const char   *htok, *last;
  UDM_HTMLTOK  tag;
  UDM_CATITEM  *Item;
  size_t       i, c;
  
  if (textbuf == NULL) return UDM_OK;
  UdmHTMLTOKInit(&tag);
  
  htok= UdmHTMLToken(textbuf, &last, &tag);
  
  if (!htok || tag.type != UDM_HTML_TAG)
    return UDM_OK;

  /* Grow the list by one item and start from an all-zero item */
  /* NOTE(review): the UdmRealloc() result is still used unchecked here */
  c= C->ncategories;
  C->Category= (UDM_CATITEM*) UdmRealloc(C->Category, sizeof(UDM_CATITEM) * (c + 1));
  Item= &C->Category[c];
  bzero((void*) Item, sizeof(UDM_CATITEM));
  
  /* Token 0 is skipped -- presumably it is the tag name itself */
  for (i= 1; i < tag.ntoks; i++)
  {
    char  *name= UdmStrndup(tag.toks[i].name, tag.toks[i].nlen);
    char  *data= UdmStrndup(tag.toks[i].val, tag.toks[i].vlen);

    /* Skip the attribute if either copy could not be made */
    if (name && data)
    {
      if (!strcmp(name, "id"))
      {
        Item->rec_id= atoi(data);
      }
      else if (!strcmp(name, "path"))
      {
        UdmCatCopyField(Item->path, sizeof(Item->path), data);
      }
      else if (!strcmp(name, "link"))
      {
        UdmCatCopyField(Item->link, sizeof(Item->link), data);
      }
      else if (!strcmp(name, "name"))
      {
        UdmCatCopyField(Item->name, sizeof(Item->name), data);
      }
    }

    UDM_FREE(name);
    UDM_FREE(data);
  }

  C->ncategories++;
  return UDM_OK;
}

/*
  Split a Unicode string into words with one of the available
  segmenters.

  The segmenter is selected by the "Segmenter" configuration variable
  and by the document language; when "Segmenter" is not set, any
  compiled-in segmenter matching the language may be used:
  - ChaSen (if compiled in): "Segmenter" is "Chasen", language "ja*";
  - MeCab  (if compiled in): "Segmenter" is "Mecab",  language "ja*";
  - frequency dictionary, Chinese (if GB2312 support is compiled in and
    a Chinese dictionary is loaded): "Segmenter" is "Freq", language is
    empty, "zh*" or "cn*";
  - frequency dictionary, Thai (if a Thai dictionary is loaded):
    "Segmenter" is "Freq", language "th*".
  A NULL "lang" matches every segmenter. The first match wins.

  Indexer  Agent whose configuration and locks are used.
  ustr     0-terminated, heap-allocated Unicode string. This function
           takes it over: it may be freed or reallocated.
  lang     Document language, or NULL if unknown.
  return   The segmented string; the caller must use (and later free)
           this pointer instead of "ustr". When no segmenter applies,
           or segmentation fails, the original string is returned
           unchanged.
*/
int *UdmUniSegment(UDM_AGENT *Indexer, int *ustr, const char *lang)
{
#if defined(CHASEN) || defined(MECAB)
  UDM_CHARSET  *sys_int= &udm_charset_sys_int;
#endif
  const char   *seg=  UdmVarListFindStr(&Indexer->Conf->Vars, "Segmenter", NULL);

#ifdef CHASEN
  if ((!seg  || !strcasecmp(seg, "Chasen")) &&
      (!lang || !strncasecmp(lang, "ja", 2)))
  {
    char        *eucstr, *eucstr_seg;
    UDM_CHARSET *eucjp_cs;
    UDM_CONV    uni_eucjp, eucjp_uni;
    size_t      srclen= UdmUniLen(ustr);
    
    eucjp_cs= UdmGetCharSet("euc-jp");
    if (!eucjp_cs) eucjp_cs= sys_int;
    UdmConvInit(&uni_eucjp, sys_int, eucjp_cs, UDM_RECODE_HTML);
    UdmConvInit(&eucjp_uni, eucjp_cs, sys_int, UDM_RECODE_HTML);

    /* The segmenter works on EUC-JP text */
    eucstr= (char*) UdmMalloc(12 * srclen + 1);
    if (!eucstr)
      return ustr;
    UdmConv(&uni_eucjp, eucstr, 12 * srclen + 1, (char*) ustr, sizeof(*ustr) * (srclen + 1));
    
    UDM_GETLOCK(Indexer, UDM_LOCK_SEGMENTER);
    eucstr_seg= chasen_sparse_tostr(eucstr);
    /*
      The result is never freed here, i.e. it belongs to the library.
      Convert it back while the lock is still held, so that a call
      made from another thread cannot overwrite it under our hands.
    */
    if (eucstr_seg)
    {
      size_t reslen= strlen(eucstr_seg) + 1;
      int    *newstr= (int*) UdmRealloc(ustr, reslen * sizeof(int));
      if (newstr)
      {
        ustr= newstr;
        UdmConv(&eucjp_uni, (char*) ustr, reslen * sizeof(int), eucstr_seg, reslen);
      }
    }
    UDM_RELEASELOCK(Indexer, UDM_LOCK_SEGMENTER);

    UDM_FREE(eucstr);
    return ustr;
  }
#endif


#ifdef MECAB
  if ((!seg  || !strcasecmp(seg, "Mecab")) &&
      (!lang || !strncasecmp(lang, "ja", 2)))
  {
    UDM_CHARSET *eucjp_cs;
    UDM_CONV    uni_eucjp, eucjp_uni;
    char        *eucstr, *eucstr_seg;
    size_t      srclen= UdmUniLen(ustr);

    eucjp_cs= UdmGetCharSet("euc-jp");
    if (!eucjp_cs) eucjp_cs= sys_int;
    UdmConvInit(&uni_eucjp, sys_int, eucjp_cs, UDM_RECODE_HTML);
    UdmConvInit(&eucjp_uni, eucjp_cs, sys_int, UDM_RECODE_HTML);

    /* The text is handed to the segmenter in EUC-JP */
    eucstr= (char*) UdmMalloc(12 * srclen + 1);
    if (!eucstr)
      return ustr;
    UdmConv(&uni_eucjp, eucstr, 12 * srclen + 1, (char*) ustr, sizeof(*ustr) * (srclen + 1));

    UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
#ifdef HAVE_PTHREADS
    mecab_lock(Indexer->Conf->mecab);
#endif
    eucstr_seg= mecab_sparse_tostr(Indexer->Conf->mecab, eucstr);
    /*
      The result is never freed here, i.e. it belongs to the library.
      Convert it back before the locks are released, so that a call
      made from another thread cannot overwrite it under our hands.
    */
    if (eucstr_seg)
    {
      size_t reslen= strlen(eucstr_seg) + 1;
      int    *newstr= (int*) UdmRealloc(ustr, reslen * sizeof(int));
      if (newstr)
      {
        ustr= newstr;
        UdmConv(&eucjp_uni, (char*) ustr, reslen * sizeof(int), eucstr_seg, reslen);
      }
    }
#ifdef HAVE_PTHREADS
    mecab_unlock(Indexer->Conf->mecab);
#endif
    UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);

    UDM_FREE(eucstr);
    return ustr;
  }
#endif


#ifdef HAVE_CHARSET_gb2312
  if ((!seg  || !strcasecmp(seg, "Freq")) && Indexer->Conf->Chi.nwords &&
      (!lang || !lang[0] || 
       !strncasecmp(lang, "zh", 2) ||
       !strncasecmp(lang, "cn", 2)))
  {
    int *seg_ustr;
    UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
    seg_ustr= UdmSegmentByFreq(&Indexer->Conf->Chi, ustr);
    UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);
    /* On success the segmented copy replaces the original string */
    if (seg_ustr != NULL)
    {
      UDM_FREE(ustr);
      ustr= seg_ustr;
    }
    return ustr;
  }
#endif

  if ((!seg  || !strcasecmp(seg, "Freq")) && Indexer->Conf->Thai.nwords &&
      (!lang || !strncasecmp(lang, "th", 2)))
  {
    int *seg_ustr;
    UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
    seg_ustr= UdmSegmentByFreq(&Indexer->Conf->Thai, ustr);
    UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);
    /* On success the segmented copy replaces the original string */
    if (seg_ustr != NULL)
    {
      UDM_FREE(ustr);
      ustr= seg_ustr;
    }
    return ustr;
  }

  /* No segmenter applies: the string is returned as it came */
  return ustr;
}


/*
  Publish information about the words of a search result as
  variables in Env->Vars:

  wordN.word   The N-th word of Res->WWList (added for every word).
  wordN.count  Its count, or "stopword" for a stopword.
  WE           " word : count" items, separated by ", ", for query
               words, spelling forms and synonyms; stopwords are
               listed as " word : stopword".
  W            "word : count / total" items, separated by ", ", for
               query words only, where "total" is the sum of the counts
               of all words having the same order; stopwords are listed
               as "word : stopword".
  WS           The query with every query word that was not found
               replaced by its most frequent suggestion (such a word
               is left out when it has no suggestion with a non-zero
               count). Set only if at least one suggestion was chosen.

  Env     Environment whose variable list is updated.
  Res     Search result to take the word list from.
  return  Always UDM_OK.
*/
int UdmResWordInfo(UDM_ENV *Env, UDM_RESULT *Res)
{
  size_t  len, i, j, wsize;
  char    *wordinfo= NULL, *end;
  size_t  corder, ccount;
  int     have_suggestions= 0;
  
  /*
    One buffer is reused for all three summaries. Each of them prints
    a word at most once, together with less than 64 bytes of separators
    and numbers. The room is computed from the actual string length,
    because that is what "%s" prints.
  */
  for (len= i= 0; i < Res->WWList.nwords; i++)
    len+= strlen(Res->WWList.Word[i].word) + 64;
  
  wsize= (1 + len) * sizeof(char);
  wordinfo= (char*) UdmMalloc(wsize);
  *wordinfo= '\0';
  
  /*
    Pass 1: "WE" and the per-word variables.
    The values printed with "%d" are cast to int, so that the argument
    type matches the conversion whatever the type of the member is.
  */
  for (i= 0; i < Res->WWList.nwords; i++)
  {
    UDM_WIDEWORD *W= &Res->WWList.Word[i];
    char name[32], count[32];

    /*
      Start from the numeric count, so that "wordN.count" is well
      defined for words of every origin, including those which are
      not listed in "WE".
    */
    sprintf(count, "%d", (int) W->count);

    if (W->origin == UDM_WORD_ORIGIN_QUERY ||
        W->origin == UDM_WORD_ORIGIN_SPELL ||
        W->origin == UDM_WORD_ORIGIN_SYNONYM)
    {
      if (wordinfo[0])
        strcat(wordinfo, ", ");
      sprintf(UDM_STREND(wordinfo), " %s : %d", W->word, (int) W->count);
    }
    else if (W->origin == UDM_WORD_ORIGIN_STOP)
    {
      if (wordinfo[0])
        strcat(wordinfo, ", ");
      sprintf(UDM_STREND(wordinfo), " %s : stopword", W->word);
      strcpy(count, "stopword");
    }
    sprintf(name, "word%d.word", (int) i);
    UdmVarListAddStr(&Env->Vars, name, W->word);
    sprintf(name, "word%d.count", (int) i);
    UdmVarListAddStr(&Env->Vars, name, count);
  }
  
  UdmVarListReplaceStr(&Env->Vars, "WE", wordinfo);
  
  /* Pass 2: "W" */
  *wordinfo= '\0';
  for (i= 0; i < Res->WWList.nwords; i++)
  {
    UDM_WIDEWORD *W= &Res->WWList.Word[i];

    /* Sum up the counts of all words sharing the order of this word */
    corder= W->order;
    ccount= 0;
    for (j= 0; j < Res->WWList.nwords; j++)
      if (Res->WWList.Word[j].order == corder)
        ccount+= Res->WWList.Word[j].count;

    if (W->origin == UDM_WORD_ORIGIN_STOP)
    {
      sprintf(UDM_STREND(wordinfo), "%s%s : stopword",
              (*wordinfo) ? ", " : "", W->word);
    }
    else if (W->origin == UDM_WORD_ORIGIN_QUERY)
    {
      sprintf(UDM_STREND(wordinfo), "%s%s : %d / %d",
              (*wordinfo) ? ", " : "", W->word, (int) W->count, (int) ccount);
    }
  }
  UdmVarListReplaceStr(&Env->Vars, "W", wordinfo);
  
  /* Pass 3: "WS" */
  *wordinfo= '\0';
  end= wordinfo;
  for (i= 0; i < Res->WWList.nwords; i++)
  {
    UDM_WIDEWORD *Wi= &Res->WWList.Word[i];
    UDM_WIDEWORD *Wb= NULL;      /* The word to print, if any */
   
    if (Wi->origin == UDM_WORD_ORIGIN_QUERY)
    {
      if (Wi->count > 0)
      {
        Wb= Wi;
      }
      else
      {
        /* Not found: take the most frequent suggestion of the same order */
        ccount= 0;
        for (j= 0; j < Res->WWList.nwords; j++)
        {
          UDM_WIDEWORD *Wj= &Res->WWList.Word[j];
          if (Wj->origin == UDM_WORD_ORIGIN_SUGGEST &&
              Wj->order == Wi->order && Wj->count > ccount)
          {
            ccount= Wj->count;
            Wb= Wj;
            have_suggestions= 1;
          }
        }
      }
    }
    else if (Wi->origin == UDM_WORD_ORIGIN_STOP)
    {
      Wb= Wi;
    }
    
    if (Wb)
    {
      sprintf(end, "%s%s", wordinfo[0] ? " " : "", Wb->word);
      end+= strlen(end);
    }
  }
  
  if (have_suggestions)
    UdmVarListReplaceStr(&Env->Vars, "WS", wordinfo);
  UDM_FREE(wordinfo);
  return UDM_OK;
}
