/* Copyright (C) 2003-2006 Datapark corp. All rights reserved.
   Copyright (C) 2000-2002 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/
#include "dps_common.h"
#include "dps_utils.h"
#include "dps_proto.h"
#include "dps_url.h"
#include "dps_hrefs.h"
#include "dps_server.h"
#include "dps_xmalloc.h"
#include "dps_host.h"
#include "dps_vars.h"
#include "dps_wild.h"
#include "dps_match.h"
#include "dps_db.h"
#include "dps_log.h"
#include "dps_charsetutils.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <ctype.h>

/* Size of the on-stack buffer that receives regex compilation error text in DpsServerAdd(). */
#define ERRSTRSIZ 1000
/* Number of DPS_SERVER slots by which a server list's array is grown when it is full. */
#define M_SERVERS_ADD 64 /*(4096 / sizeof(DPS_SERVER))*/ /* FIXME: use page size instead of 4096 for your platform */

/* Uncomment to make DpsServerAdd() print every added server to stderr.
#define TRACE_SRVS 1
*/

/* Largest `ordre` value among the servers added through DpsServerAdd() so far;
   DpsServerFind() uses it as the initial upper bound of its search. */
static size_t dps_max_server_ordre = 0;

/*
 * DpsServerAdd: register the server description `srv` in the configuration
 * of agent `A`.
 *
 * The target list is selected by srv->Match.match_type.  The pattern is
 * copied and, for DPS_MATCH_BEGIN patterns, normalized according to the
 * "Follow" variable of `srv`:
 *   DPS_FOLLOW_PATH - cut at '?' and after the last '/';
 *   DPS_FOLLOW_SITE - reduced to "schema://hostinfo/" (or cut after the
 *                     first '/' when there is no hostinfo);
 *   other values    - left as is.
 * For the "news" schema everything after the server name is dropped.
 * For DPS_MATCH_REGEX the pattern is compiled inside `srv`; when a new
 * entry is created the compiled data (Match.reg, Match.arg) is moved to it
 * and the pointers in `srv` are reset to NULL.
 *
 * If the list already holds an entry with the same (normalized) pattern no
 * new entry is created.  In both cases srv->site_id receives the site_id of
 * the list entry.
 *
 * Return values: 0 (DPS_OK) on success, non-zero on error.  DPS_ERROR is
 * returned for a NULL `srv`, an out-of-range match type, an allocation
 * failure, an unparsable URL or a bad regex; for a newly created entry the
 * result of DpsSrvAction(DPS_SRV_ACTION_ADD) is returned.
 */
__C_LINK int __DPSCALL DpsServerAdd(DPS_AGENT *A, DPS_SERVER *srv){
	int		res = DPS_OK;	/* also the result when the entry already exists */
	DPS_URL		from;
	char		*urlstr;
	DPS_SERVER	*entry = NULL;
	DPS_SERVERLIST  *List;
	size_t		i, len;
	DPS_ENV         *Conf = A->Conf;
	size_t          idx;
#ifdef WITH_PARANOIA
	void * paran = DpsViolationEnter(paran);
#endif

	if (srv == NULL) {
#ifdef WITH_PARANOIA
	  DpsViolationExit(paran);
#endif
	  return DPS_ERROR;
	}
	idx = (size_t)srv->Match.match_type;
	if (idx >= DPS_MATCH_max) {
#ifdef WITH_PARANOIA
	  DpsViolationExit(paran);
#endif
	  return DPS_ERROR;
	}
	List = &Conf->Servers[idx];

	/* Work on a private copy so that srv->Match.pattern stays unchanged. */
	/* The extra bytes leave room for the normalization done below.      */
	len = dps_strlen(DPS_NULL2EMPTY(srv->Match.pattern)) + 4;
	if ((urlstr = (char*)DpsMalloc(len + 1)) == NULL) {
	  DpsLog(A, DPS_LOG_ERROR, "Can't alloc %d bytes at "__FILE__":%d", (int)len, __LINE__);
#ifdef WITH_PARANOIA
	  DpsViolationExit(paran);
#endif
	  return DPS_ERROR;
	}
	dps_strcpy(urlstr, DPS_NULL2EMPTY(srv->Match.pattern));

	from.freeme = 0;
	DpsURLInit(&from);

	/* From here on every exit goes through `cleanup`, which releases */
	/* both urlstr and the parsed URL.                                */

	if ((idx == DPS_MATCH_BEGIN) && (urlstr[0])) {
		char *s, *anchor;

		/* Check whether valid URL is passed */
		if ((res = DpsURLParse(&from, urlstr))) {
			switch (res) {
				case DPS_URL_LONG:
				  DpsLog(A, DPS_LOG_ERROR, "URL too long");
				  break;
				case DPS_URL_BAD:
				  DpsLog(A, DPS_LOG_ERROR, "Badly formed URL");
				  break;
				default:
				  DpsLog(A, DPS_LOG_ERROR, "Error while parsing URL");
				  break;
			}
			res = DPS_ERROR;
			goto cleanup;
		}
		if ((from.hostinfo) && (from.filename == NULL)) {
			/* Rebuild the URL from its parsed parts, e.g.   */
			/* http://localhost -> http://localhost/          */
			dps_snprintf(urlstr, len, "%s://%s%s", DPS_NULL2EMPTY(from.schema), from.hostinfo, DPS_NULL2EMPTY(from.path));
		}

		switch (DpsVarListFindInt(&srv->Vars, "Follow", DPS_FOLLOW_PATH)) {
			case DPS_FOLLOW_PATH:
				/* Cut before '?' and after last '/' */
				if ((anchor = strchr(urlstr, '?')) != NULL)
					*anchor = '\0';
				if ((s = strrchr(urlstr, '/')) != NULL)
					*(s + 1) = '\0';
				break;

			case DPS_FOLLOW_SITE:
				if (from.hostinfo != NULL) {
					/* Cut after hostinfo */
					dps_snprintf(urlstr, len, "%s://%s/", DPS_NULL2EMPTY(from.schema), from.hostinfo);
				} else {
					/* Cut after first '/' */
					if ((s = strchr(urlstr, '/')) != NULL)
						*(s + 1) = '\0';
				}
				break;

			case DPS_FOLLOW_NO:
			case DPS_FOLLOW_WORLD:
			default:
				break;
		}
		/* Cut after the server name to remove group names: group  */
		/* names are not present in message URLs.  The first seven */
		/* characters ("news://") are skipped, so make sure the    */
		/* string is at least that long before looking past them.  */
		if ((strcmp(DPS_NULL2EMPTY(from.schema), "news") == 0) && (dps_strlen(urlstr) >= 7)) {
			char *slash = strchr(urlstr + 7, '/');
			if (slash != NULL) *(slash + 1) = '\0';
		}
	} else if (idx == DPS_MATCH_REGEX) {
		char regerrstr[ERRSTRSIZ] = "";

		if (DPS_OK != DpsMatchComp(&srv->Match, regerrstr, sizeof(regerrstr) - 1)) {
			dps_snprintf(Conf->errstr, sizeof(Conf->errstr), "Wrong regex in config file: %s: %s", urlstr, regerrstr);
			res = DPS_ERROR;
			goto cleanup;
		}
	}

	/* Reuse an entry that already carries the same pattern, if any. */
	for (i = 0; i < List->nservers; i++) {
		if (strcmp(DPS_NULL2EMPTY(List->Server[i].Match.pattern), urlstr) == 0) {
			entry = &List->Server[i];
			break;
		}
	}

	if (entry == NULL) {
	  List->sorted = 0;
	  if (List->nservers >= List->mservers) {
	    List->mservers += M_SERVERS_ADD;
	    List->Server = (DPS_SERVER *)DpsRealloc(List->Server, List->mservers * sizeof(DPS_SERVER));
	    if (List->Server == NULL) {
	      DpsLog(A, DPS_LOG_ERROR, "Cant' realloc %d bytes at "__FILE__":%d", (int)(List->mservers * sizeof(DPS_SERVER)), __LINE__);
	      List->nservers = List->mservers = 0;
	      res = DPS_ERROR;
	      goto cleanup;
	    }
	  }
	  entry = &List->Server[List->nservers];
	  DpsServerInit(entry);

	  /* Obtain the last_crawled slot first: if this fails nothing has  */
	  /* been copied into the entry or moved out of srv yet, so there   */
	  /* is nothing to roll back.                                       */
	  if (srv->last_crawled == NULL) {
	    entry->last_crawled = (time_t*)DpsMalloc(sizeof(time_t));
	    if (entry->last_crawled == NULL) {
	      DpsLog(A, DPS_LOG_ERROR, "Cant' alloc %d bytes at "__FILE__":%d", (int)sizeof(time_t), __LINE__);
	      res = DPS_ERROR;
	      goto cleanup;
	    }
	    *entry->last_crawled = 0;	/* do not leave the timestamp indeterminate */
	    entry->need_free = 1;
	  } else {
	    /* Shared with srv: the entry does not own this storage. */
	    entry->last_crawled = srv->last_crawled;
	    entry->need_free = 0;
	  }

	  DpsVarListReplaceLst(&entry->Vars, &srv->Vars, NULL, "*");

	  entry->Match.pattern = (char*)DpsStrdup(urlstr);
	  entry->Match.nomatch = srv->Match.nomatch;
	  entry->Match.case_sense = srv->Match.case_sense;
	  entry->Match.match_type = srv->Match.match_type;
	  /* Hand the compiled match data over to the entry. */
	  entry->Match.reg = srv->Match.reg;
	  entry->Match.arg = srv->Match.arg;
	  srv->Match.reg = NULL;
	  srv->Match.arg = NULL;
	  entry->command = srv->command;
	  entry->ordre = srv->ordre;
	  entry->weight = srv->weight;
	  entry->MaxHops = srv->MaxHops;
	  entry->MaxDepth = srv->MaxDepth;
	  entry->MaxDocsPerServer = srv->MaxDocsPerServer;
	  entry->ExpireAt = srv->ExpireAt;
	  if (List->nservers == 0) List->min_ordre = srv->ordre;
	  entry->crawl_delay = srv->crawl_delay;

#ifdef TRACE_SRVS
	  fprintf(stderr, " command:%c  match_type:%d  pattern: %s\n", srv->command, srv->Match.match_type, urlstr);
#endif

	  res = DpsSrvAction(A, entry, DPS_SRV_ACTION_ADD);

	  List->nservers++;
	  if (entry->ordre > dps_max_server_ordre) dps_max_server_ordre = entry->ordre;
	}

	srv->site_id = entry->site_id;

cleanup:
	DPS_FREE(urlstr);
	DpsURLFree(&from);
#ifdef WITH_PARANOIA
	DpsViolationExit(paran);
#endif
	return(res);
}

/*
 * DpsServerFree: release the resources held by one server entry: its match
 * data, its variable list and, when the entry owns it (need_free is set),
 * the last_crawled slot.  The DPS_SERVER structure itself is not freed.
 */
void DpsServerFree(DPS_SERVER *Server){
	DpsMatchFree(&Server->Match);
	DpsVarListFree(&Server->Vars);

	/* A last_crawled slot borrowed from elsewhere must not be freed here. */
	if (!Server->need_free) {
	  return;
	}
	DPS_FREE(Server->last_crawled);
	Server->need_free = 0;
}

/*
 * DpsServerListFree: free every entry of a server list, then the entry
 * array itself, and reset the used/allocated counters to zero.
 */
void DpsServerListFree(DPS_SERVERLIST *List){
	size_t pos = 0;

	while (pos < List->nservers) {
		DpsServerFree(&List->Server[pos]);
		pos++;
	}

	List->mservers = 0;
	List->nservers = 0;
	DPS_FREE(List->Server);
}

/*
 * DpsServerFind: find the server entry that applies to `url`.
 *
 * Every server list (one per match type, DPS_MATCH_min .. DPS_MATCH_max-1)
 * is examined.  A list is skipped when it is empty or when its min_ordre is
 * above the best `ordre` found so far; inside a list only entries whose
 * `ordre` does not exceed the current best are tried, and the first one
 * that matches becomes the new best.  (The inner loop stops at the first
 * entry with a larger `ordre`, so it presumably relies on the list being
 * ordered by `ordre` - confirm against the code that sorts the lists.)
 * An entry matches when its "Follow" variable is DPS_FOLLOW_WORLD or when
 * DpsMatchExec() succeeds on its pattern.  For the DPS_MATCH_SUBNET list
 * the host of `url` is resolved first and its address is passed to
 * DpsMatchExec().
 *
 * Agent      - agent whose configuration holds the server lists.
 * url        - URL to look up.
 * charset_id - stored in the connection used for the host lookup.
 * aliastr    - may be NULL.  Otherwise, when the returned entry has an
 *              "Alias" variable, *aliastr receives a newly allocated string
 *              built by DpsMatchApply() (or NULL if the allocation failed);
 *              the caller is responsible for releasing it.  When the
 *              returned entry has no alias, *aliastr is left as the caller
 *              passed it, except that an alias buffer allocated by this
 *              call for a superseded entry is freed and *aliastr is then
 *              set to NULL.
 *
 * Returns the matching entry, or NULL when nothing matches.
 */
DPS_SERVER * DpsServerFind(DPS_AGENT *Agent, const char *url, int charset_id, char **aliastr) {
#define NS 10
  DPS_MATCH_PART P[NS];
  DPS_SERVERLIST *List;
  size_t	 i, cur_idx = dps_max_server_ordre, tix;
  char		 *robots = NULL;
  char           *own_alias = NULL;	/* alias buffer allocated by this call, if any */
  DPS_SERVER	 *Res = NULL;
  DPS_CONN       conn;
  char           net[32];

  /* conn.sin is handed to DpsMatchExec() for every list, not only for */
  /* the subnet one, so never leave it indeterminate.                  */
  bzero(&conn, sizeof(conn));
  net[0] = '\0';

  /* If it's a robots.txt URL, make a copy cut down to the host part.   */
  /* NOTE(review): the copy is only built and freed; nothing below      */
  /* consults it.                                                       */
  if ((robots = strstr(url, "/robots.txt")) != NULL) {
    if (strcmp(robots, "/robots.txt") == 0) {
      robots = (char*)DpsStrdup(url);
      if (robots != NULL) robots[dps_strlen(url) - 10] = '\0';
    } else {
      robots = NULL;
    }
  }

  for (tix = DPS_MATCH_min; tix < DPS_MATCH_max; tix++) {
    List = &Agent->Conf->Servers[tix];
    if (List->nservers == 0) continue;
    if (List->min_ordre > cur_idx) continue;	/* nothing here can beat the current best */

    if (tix == DPS_MATCH_SUBNET) {
      /* Resolve the host so that subnet patterns can be matched */
      /* against its address.                                    */
      DPS_URL  *URL = DpsURLInit(NULL);

      if (URL == NULL) continue;

      if (DpsURLParse(URL, url)) {
	DpsURLFree(URL);
	continue;
      }
      bzero(&conn, sizeof(conn));
      conn.hostname = URL->hostname;
      conn.port = 80;
      conn.charset_id = charset_id;
      if (DpsHostLookup(Agent, &conn) != -1) {
	inet_ntop(AF_INET, &conn.sin.sin_addr, net, sizeof(net));
      }
      DpsURLFree(URL);
    }

    for (i = 0; (i < List->nservers) && (List->Server[i].ordre <= cur_idx); i++) {
      DPS_SERVER      *srv = &List->Server[i];
      const char      *alias = DpsVarListFindStr(&srv->Vars, "Alias", NULL);
      size_t          aliastrlen;
      int             follow = DpsVarListFindInt(&srv->Vars, "Follow", DPS_FOLLOW_PATH);

      /* NOTE(review): for DPS_FOLLOW_WORLD entries DpsMatchExec() is   */
      /* not run, so P is not filled in before DpsMatchApply() below.   */
      if (follow == DPS_FOLLOW_WORLD || !DpsMatchExec(&srv->Match, url, net, &conn.sin, NS, P)) {
	cur_idx = srv->ordre;
	Res = srv;
	if (aliastr != NULL) {
	  /* Drop the alias produced for a superseded match so that it */
	  /* is neither leaked nor reported for the wrong entry.       */
	  if (own_alias != NULL) {
	    DPS_FREE(own_alias);
	    own_alias = NULL;
	    *aliastr = NULL;
	  }
	  if (alias != NULL) {
	    aliastrlen = 128 + dps_strlen(url) + dps_strlen(alias) + dps_strlen(DPS_NULL2EMPTY(srv->Match.pattern));
	    own_alias = (char*)DpsMalloc(aliastrlen + 1);
	    *aliastr = own_alias;
	    if (own_alias != NULL)
	      DpsMatchApply(own_alias, aliastrlen, url, alias, &srv->Match, NS, P);
	  }
	}
	break;
      }
    }
  }
  DPS_FREE(robots);
  return(Res);
}

/* The code below is compiled out (#if 0); it is not part of the build. */
#if 0
/* qsort-style comparator for DPS_SERVER entries: longer `url` first,  */
/* and for equal lengths the larger `rec_id` first.                    */
static int cmpserver(const void *s1,const void *s2){
	int res;
	
	if(!(res=dps_strlen(((const DPS_SERVER*)s2)->url)-dps_strlen(((const DPS_SERVER*)s1)->url)))
		res=(((const DPS_SERVER*)s2)->rec_id)-(((const DPS_SERVER*)s1)->rec_id);
	return(res);
}
/* Sort a non-empty server list with cmpserver(). */
void DpsServerListSort(DPS_SERVERLIST *List){
	/*  Long names should be found first   */
	/*  to allow different options         */
	/*  for a server and its subdirectories */
	if (List->nservers) DpsSort(List->Server, List->nservers, sizeof(DPS_SERVER), cmpserver);
}
#endif

 /*
  * DpsSpiderParamInit: fill a spider parameter block with the built-in
  * defaults.  Always returns DPS_OK.
  */
 int DpsSpiderParamInit(DPS_SPIDERPARAM *Spider){
	size_t hop = 0;

	/* Same reindex period for each of the first DPS_DEFAULT_MAX_HOPS slots. */
	while (hop < DPS_DEFAULT_MAX_HOPS) {
	  Spider->period[hop] = DPS_DEFAULT_REINDEX_TIME;
	  hop++;
	}

	/* Crawl scope */
	Spider->maxhops = DPS_DEFAULT_MAX_HOPS;
	Spider->follow = DPS_FOLLOW_PATH;
	Spider->index = 1;
	Spider->use_robots = 1;
	Spider->use_clones = 1;

	/* Timeouts and network error handling */
	Spider->read_timeout = DPS_READ_TIMEOUT;
	Spider->doc_timeout = DPS_DOC_TIMEOUT;
	Spider->max_net_errors = DPS_MAXNETERRORS;
	Spider->net_error_delay_time = DPS_DEFAULT_NET_ERROR_DELAY_TIME;

	Spider->ExpireAt.eight = 0;

	return DPS_OK;
}

/*
 * DpsServerInit: reset a server description to its initial state: every
 * byte cleared, then the non-zero defaults applied.  Always returns 0.
 */
__C_LINK int __DPSCALL DpsServerInit(DPS_SERVER * srv){
	memset((void*)srv, 0, sizeof(*srv));

	srv->ndocs = 0;                          /* no docs indexed */
	srv->use_robots = 1;
	srv->weight = 1;                         /* default ServerWeight */
	srv->MaxDocsPerServer = (dps_uint4)-1;   /* default MaxDocsPerServer value */
	srv->MaxDepth = DPS_DEFAULT_MAX_DEPTH;   /* default MaxDepth value */
	srv->MaxHops = DPS_DEFAULT_MAX_HOPS;     /* default MaxHops value */
	srv->Match.match_type = DPS_MATCH_BEGIN;

	return 0;
}

/*
 * DpsServerGetSiteId: return the site id for the site that document `Doc`
 * belongs to, creating/looking it up through DpsSrvAction() when it is not
 * in the agent's small site-id cache.
 *
 * The site URL is "schema://host/", taken from the document's "E_URL"
 * section when present (cut after the first '/' that follows "://"),
 * otherwise built from Doc->CurURL.schema and Doc->CurURL.hostname.  The
 * host name is then shortened to at most Indexer->Flags.MaxSiteLevel
 * domain levels (see the loop below); when it is shortened the prefix is
 * rewritten to the literal "http://".  The result is lower-cased and used
 * as the cache key and as the pattern handed to DpsSrvAction().
 *
 * Indexer - agent; its SiteIdCache / SiteIdCacheId / pSiteIdCache ring
 *           is read and updated.
 * srv     - server entry the document was matched to; supplies `ordre`
 *           and the parent `site_id` of the site record.
 * Doc     - document being processed.
 *
 * Returns the site id, or 0 on allocation failure, on an "E_URL" value
 * without "://" or without a '/' after it, or when DpsSrvAction() does
 * not return DPS_OK.
 */
urlid_t DpsServerGetSiteId(DPS_AGENT *Indexer, DPS_SERVER *srv, DPS_DOCUMENT *Doc) {
  char *urlstr;
  DPS_SERVER S;
  int rc, i;
  urlid_t id = 0;
  const char *url;
  char *psite;

  url = DpsVarListFindStr(&Doc->Sections, "E_URL", NULL);

  if (url == NULL) {
    const char *schema = DPS_NULL2EMPTY(Doc->CurURL.schema);
    const char *host = DPS_NULL2EMPTY(Doc->CurURL.hostname);
    size_t room = dps_strlen(schema) + dps_strlen(host) + 10;

    if ((urlstr = (char*)DpsMalloc(room)) == NULL) {
      return 0;
    }
    dps_snprintf(urlstr, room, "%s://%s/", schema, host);
  } else {
    register char *p;
    if ((urlstr = (char*)DpsMalloc(dps_strlen(url) + 2)) == NULL) {
      return 0;
    }
    dps_strcpy(urlstr, url);
    if ((p = strstr(urlstr, "://")) == NULL) { DPS_FREE(urlstr); return 0; }
    if ((p = strchr(p + 3, '/')) == NULL) { DPS_FREE(urlstr); return 0; }
    p[1] = '\0';	/* keep "schema://host/" only */
  }

  psite = urlstr;

  /* Walk the host name backwards, starting at the character just before */
  /* the trailing '/', counting dots as domain levels.  Once, when the   */
  /* second dot is met and the label between it and the previous dot is  */
  /* at most three characters long (pd - e < 5), that dot is not counted */
  /* as a level.  When MaxSiteLevel levels have been counted, the seven  */
  /* bytes ending at the current dot are overwritten with "http://", so  */
  /* the site URL starts right there.  The scan stops at the '/' of the  */
  /* "://" separator.                                                    */
  {
    char *e = urlstr + dps_strlen(urlstr) - 2, *pd = e;
    int level = 0, have_three = 0;
    for (; e > urlstr; --e) {
      if (*e == '.') {
	if (level == 1 && have_three == 0) {
	  if (pd - e < 5) have_three++;
	  else level++;
	} else level++;
	pd = e;
	if (level == Indexer->Flags.MaxSiteLevel) {
	  /* The rewrite needs six bytes in front of the dot; with a   */
	  /* very short (or empty) schema they are not there, and the  */
	  /* write would start before the buffer.  Keep the full URL   */
	  /* in that case.                                             */
	  if (e - urlstr >= 6) {
	    dps_memmove(e - 6, "http://", 7);
	    psite = e - 6;
	  }
	  break;
	}
      } else if (*e == '/') break;
    }
  }

  {
    register char *c;
    for (c = psite; *c != '\0'; c++) *c = dps_tolower(*c);
  }

  for (i = 0; i < DPS_SITEID_CACHE_SIZE; i++) {
    if (!strcmp(DPS_NULL2EMPTY(Indexer->SiteIdCache[i]), psite)) {
      id = Indexer->SiteIdCacheId[i];
      break;
    }
  }

  if (id != 0) {
    DPS_FREE(urlstr);
    return id;
  }

  /* Not cached: ask DpsSrvAction() for the id of a temporary 'S' record. */
  bzero((void*)&S, sizeof(S));
  S.Match.pattern     = psite;	/* borrowed from urlstr, not owned by S */
  S.Match.match_type  = DPS_MATCH_BEGIN;
  S.Match.nomatch     = 0;
  S.command = 'S';
  S.ordre = srv->ordre;
  S.parent = srv->site_id;
  rc = DpsSrvAction(Indexer, &S, DPS_SRV_ACTION_ID);

  /* Store the result in the next slot of the ring cache.               */
  /* NOTE(review): the slot is overwritten even when rc is not DPS_OK.  */
  DPS_FREE(Indexer->SiteIdCache[Indexer->pSiteIdCache]);
  Indexer->SiteIdCache[Indexer->pSiteIdCache] = (char*)DpsStrdup(psite);
  Indexer->SiteIdCacheId[Indexer->pSiteIdCache] = S.site_id;
  Indexer->pSiteIdCache = (Indexer->pSiteIdCache + 1) % DPS_SITEID_CACHE_SIZE;

  DPS_FREE(urlstr);
  return (rc == DPS_OK) ? S.site_id : 0;
}
