/*
*         Portable Batch System (PBS) Software License
* 
* Copyright (c) 1999, MRJ Technology Solutions.
* All rights reserved.
* 
* Acknowledgment: The Portable Batch System Software was originally developed
* as a joint project between the Numerical Aerospace Simulation (NAS) Systems
* Division of NASA Ames Research Center and the National Energy Research
* Supercomputer Center (NERSC) of Lawrence Livermore National Laboratory.
* 
* Redistribution of the Portable Batch System Software and use in source
* and binary forms, with or without modification, are permitted provided
* that the following conditions are met:
* 
* - Redistributions of source code must retain the above copyright and
*   acknowledgment notices, this list of conditions and the following
*   disclaimer.
* 
* - Redistributions in binary form must reproduce the above copyright and 
*   acknowledgment notices, this list of conditions and the following
*   disclaimer in the documentation and/or other materials provided with the
*   distribution.
* 
* - All advertising materials mentioning features or use of this software must
*   display the following acknowledgment:
* 
*   This product includes software developed by NASA Ames Research Center,
*   Lawrence Livermore National Laboratory, and MRJ Technology Solutions.
* 
*         DISCLAIMER OF WARRANTY
* 
* THIS SOFTWARE IS PROVIDED BY MRJ TECHNOLOGY SOLUTIONS ("MRJ") "AS IS" WITHOUT 
* WARRANTY OF ANY KIND, AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT ARE EXPRESSLY DISCLAIMED.
* 
* IN NO EVENT, UNLESS REQUIRED BY APPLICABLE LAW, SHALL MRJ, NASA, NOR
* THE U.S. GOVERNMENT BE LIABLE FOR ANY DIRECT DAMAGES WHATSOEVER,
* NOR ANY INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* 
* This license will be governed by the laws of the Commonwealth of Virginia,
* without reference to its choice of law rules.
*/
#include <pbs_config.h>   /* the master config generated by configure */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <memory.h>
#include <netdb.h>
#include <pwd.h>
#include <time.h>
#include <unistd.h>

#include <sys/types.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <dirent.h>
#ifdef _CRAY
#include <sys/category.h>
#endif	/* _CRAY */
#include <sys/time.h>
#include <sys/resource.h>

#include "pbs_ifl.h"
#include "log.h"
#include "list_link.h"
#include "attribute.h"
#include "server_limits.h"
#include "server.h"
#include "queue.h"
#include "job.h"
#include "work_task.h"
#include "tracking.h"
#include "svrfunc.h"
#include "acct.h"
#include "pbs_version.h"
#include "rpp.h"

#ifndef SIGKILL
/* there is some weird stuff in gcc include files signal.h & sys/param.h */
#include <signal.h>
#endif

/* RCS identification string for this source file */
static char ident[] = "@(#) $RCSfile: pbsd_init.c,v $ $Revision: 2.3 $";

/* global Data Items */

/* message strings, defined elsewhere, used for logging and prompting here */
extern char	*msg_startup3;
extern char     *msg_daemonname;
extern char	*msg_init_abt;
extern char	*msg_init_queued;
extern char	*msg_init_substate;
extern char	*msg_err_noqueue;
extern char	*msg_err_malloc;
extern char	*msg_init_noqueues;
extern char	*msg_init_recovque;
extern char	*msg_init_expctq;
extern char	*msg_init_nojobs;
extern char	*msg_init_exptjobs;
extern char	*msg_init_norerun;
extern char	*msg_init_unkstate;
extern char	*msg_init_baddb;
extern char	*msg_init_chdir;
extern char	*msg_init_badjob;
extern char	*msg_script_open;

/*
 * file and directory names; acct_file, log_file, path_home and path_log
 * are only read in this file, the remaining path_* pointers are assigned
 * by pbsd_init() from build_path()
 */
extern char	*acct_file;
extern char	*log_file;
extern char	*path_home;
extern char	*path_acct;
extern char	path_log[];
extern char	*path_priv;
extern char	*path_jobs;
extern char	*path_queues;
extern char	*path_spool;
extern char	*path_svrdb;
extern char	*path_svrdb_new;
extern char	*path_track;
extern char	*path_nodes;
extern char	*path_nodes_new;
extern char	*path_nodestate;

/* server wide state; the list heads are cleared by pbsd_init() */
extern int	 queue_rank;
extern char	 server_name[];
extern int	 svr_delay_entry;
extern list_head svr_newjobs;
extern list_head svr_alljobs;
extern list_head svr_queues;
extern list_head svr_requests;
extern list_head task_list_immed;
extern list_head task_list_timed;
extern list_head task_list_event;
extern time_t	 time_now;

extern struct server server;

/* External Functions Called */

extern void   on_job_exit A_((struct work_task *));
extern void   on_job_rerun A_((struct work_task *));
extern void   set_resc_assigned A_((job *, enum batch_op));


/* Private functions in this file */

static void  init_abt_job A_((job *));
static char *build_path A_((char *parent, char *name, char *subdir));
static void  catch_child A_((int));
static void  change_logs A_((int));
static int   chk_save_file A_((char *filename));
static void  need_y_response A_((int));
static void  pbsd_init_job A_((job *pjob, int type));
static void  pbsd_init_reque A_((job *job, int change_state));
static void  resume_net_move A_((struct work_task *));
static void  rm_files A_((char *dirname));
static void  stop_me A_((int));

/* private data */

/* values for the change_state argument of pbsd_init_reque() */
#define CHANGE_STATE 1
#define KEEP_STATE   0

/*
 * This file contains the functions to initialize the PBS Batch Server.
 * The code is called once when the server is brought up.
 */

int pbsd_init(type)
	int   type;		/* type of initialization   */
{
	int	a_opt = -1;
	int	baselen;
	char	basen[MAXPATHLEN+1];
	struct dirent *pdirent;
	DIR	*dir;
	int	 fd;
	int	 had;
	int	 i;
	static char id[] = "pbsd_init";
	char	*job_suffix = JOB_FILE_SUFFIX;
	int	 job_suf_len = strlen(job_suffix);
	int	 logtype;
	char	*new_tag = ".new";
	job	*pjob;
	pbs_queue *pque;
	char	*psuffix;
	int	 rc;
	struct stat statbuf;
	char	*suffix_slash = "/";
	struct sigaction act;
	struct sigaction oact;

	/* The following is code to reduce security risks                */

	if (setup_env(PBS_ENVIRON)==-1) 
		return (-1);
	i = getgid();
	(void)setgroups(1, (gid_t *)&i);	/* secure suppl. groups */
	i = sysconf(_SC_OPEN_MAX);
	while (--i < 2)
		(void)close(i); /* close any file desc left open by parent */

#ifndef DEBUG
#ifdef _CRAY
	(void)limit(C_JOB,      0, L_CPROC, 0);
	(void)limit(C_JOB,      0, L_CPU,   0);
	(void)limit(C_JOBPROCS, 0, L_CPU,   0);
	(void)limit(C_PROC,     0, L_FD,  255);
	(void)limit(C_JOB,      0, L_FSBLK, 0);
	(void)limit(C_JOBPROCS, 0, L_FSBLK, 0);
	(void)limit(C_JOB,      0, L_MEM  , 0);
	(void)limit(C_JOBPROCS, 0, L_MEM  , 0);
#else	/* not  _CRAY */
	{
	struct rlimit rlimit;

	rlimit.rlim_cur = RLIM_INFINITY;
	rlimit.rlim_max = RLIM_INFINITY;
	(void)setrlimit(RLIMIT_CPU,   &rlimit);
	(void)setrlimit(RLIMIT_FSIZE, &rlimit);
	(void)setrlimit(RLIMIT_DATA,  &rlimit);
	(void)setrlimit(RLIMIT_STACK, &rlimit);
#ifdef	RLIMIT_RSS
	(void)setrlimit(RLIMIT_RSS  , &rlimit);
#endif	/* RLIMIT_RSS */
#ifdef	RLIMIT_VMEM
	(void)setrlimit(RLIMIT_VMEM  , &rlimit);
#endif	/* RLIMIT_VMEM */
	}
#endif	/* not _CRAY */
#endif	/* DEBUG */

	/* 1. set up to catch or ignore various signals */

	sigemptyset(&act.sa_mask);
	act.sa_flags   = 0;
	act.sa_handler = change_logs;
	if (sigaction( SIGHUP, &act, &oact) != 0) {
		log_err(errno, id, "sigaction for HUP");
		return (2);
	}
	act.sa_handler = stop_me;
	if (sigaction( SIGINT, &act, &oact) != 0) {
		log_err(errno, id, "sigaction for INT");
		return (2);
	}
	if (sigaction( SIGTERM, &act, &oact) != 0) {
		log_err(errno, id, "sigactin for TERM");
		return (2);
	}
#ifdef NDEBUG
	if (sigaction( SIGQUIT, &act, &oact) != 0) {
		log_err(errno, id, "sigactin for QUIT");
		return (2);
	}
#endif	/* NDEBUG */
#ifdef SIGSHUTDN
	if (sigaction( SIGSHUTDN, &act, &oact) != 0) {
		log_err(errno, id, "sigactin for SHUTDN");
		return (2);
	}
#endif

	act.sa_handler = catch_child;
	if (sigaction( SIGCHLD, &act, &oact) != 0) {
		log_err(errno, id, "sigaction for CHLD");
		return (2);
	}

	act.sa_handler = SIG_IGN;
	if (sigaction( SIGPIPE, &act, &oact) != 0) {
		log_err(errno, id, "sigaction for PIPE");
		return (2);
	}
	if (sigaction( SIGUSR1, &act, &oact) != 0) {
		log_err(errno, id, "sigaction for USR1");
		return (2);
	}
	if (sigaction( SIGUSR2, &act, &oact) != 0) {
		log_err(errno, id, "sigaction for USR2");
		return (2);
	}

	/* 2. set up the various paths and other global variables we need */

	path_priv       = build_path(path_home, PBS_SVR_PRIVATE, suffix_slash);
	path_spool      = build_path(path_home, PBS_SPOOLDIR, suffix_slash);
	path_queues     = build_path(path_priv, PBS_QUEDIR, suffix_slash);
	path_jobs       = build_path(path_priv, PBS_JOBDIR, suffix_slash);
	path_acct	= build_path(path_priv, PBS_ACCT, suffix_slash);
	path_svrdb      = build_path(path_priv, PBS_SERVERDB, NULL);
	path_svrdb_new  = build_path(path_priv, PBS_SERVERDB, new_tag);
	path_track	= build_path(path_priv, PBS_TRACKING, NULL);
	path_nodes	= build_path(path_priv, NODE_DESCRIP, NULL);
	path_nodes_new	= build_path(path_priv, NODE_DESCRIP, new_tag);
	path_nodestate	= build_path(path_priv, NODE_STATUS, NULL);

#if !defined(DEBUG) && !defined(NO_SECURITY_CHECK)
	rc  = chk_file_sec(path_jobs,   1, 0, S_IWGRP|S_IWOTH, 1);
	rc |= chk_file_sec(path_queues, 1, 0, S_IWGRP|S_IWOTH, 0);
	rc |= chk_file_sec(path_spool,  1, 1, S_IWOTH, 0);
	rc |= chk_file_sec(path_acct,	1, 1, S_IWGRP|S_IWOTH, 0);
	rc |= chk_file_sec(PBS_ENVIRON, 0, 0, S_IWGRP|S_IWOTH, 1);
	if (rc) 
		return (3);
#endif	/* not DEBUG and not NO_SECURITY_CHECK */
	
	CLEAR_HEAD(svr_requests);
	CLEAR_HEAD(task_list_immed);
	CLEAR_HEAD(task_list_timed);
	CLEAR_HEAD(task_list_event);
	CLEAR_HEAD(svr_queues);
	CLEAR_HEAD(svr_alljobs);
	CLEAR_HEAD(svr_newjobs);

	time_now = time((time_t *)0);

	/* 3. Set default server attibutes values */

	if (server.sv_attr[(int)SRV_ATR_scheduling].at_flags & ATR_VFLAG_SET)
		a_opt = server.sv_attr[(int)SRV_ATR_scheduling].at_val.at_long;

	for (i=0; i<SRV_ATR_LAST; i++)
		clear_attr(&server.sv_attr[i], &svr_attr_def[i]);

	server.sv_attr[(int)SRV_ATR_schedule_iteration].at_val.at_long = 
							PBS_SCHEDULE_CYCLE;
	server.sv_attr[(int)SRV_ATR_schedule_iteration].at_flags =ATR_VFLAG_SET;

	server.sv_attr[(int)SRV_ATR_State].at_val.at_long = SV_STATE_INIT;
	server.sv_attr[(int)SRV_ATR_State].at_flags = ATR_VFLAG_SET;

	(void)svr_attr_def[(int)SRV_ATR_mailfrom].at_decode(
			&server.sv_attr[(int)SRV_ATR_mailfrom], 0, 0,
			PBS_DEFAULT_MAIL);

	/* 4. force logging of all types */

	server.sv_attr[(int)SRV_ATR_log_events].at_val.at_long = PBSEVENT_MASK;
	server.sv_attr[(int)SRV_ATR_log_events].at_flags = ATR_VFLAG_SET;

	/* 5. If not a "create" initialization, recover server db */

	rc = chk_save_file(path_svrdb);
	if ( type != RECOV_CREATE) {

		/* Open the server database (save file) and read it in */
	
		if ((rc != 0) || ((rc =svr_recov(path_svrdb)) == -1)) {
			log_err(rc, "pbsd_init", msg_init_baddb);
			return (-1);
		}
		if (server.sv_attr[(int)SRV_ATR_resource_assn].at_flags &
					ATR_VFLAG_SET) {
		     svr_attr_def[(int)SRV_ATR_resource_assn].at_free(
			    &server.sv_attr[(int)SRV_ATR_resource_assn]);  
		}
	} else {
		if (rc == 0) {		/* path_svrdb exists */
			need_y_response(type);
			rm_files(path_priv);
			svr_save(&server, SVR_SAVE_FULL);
		}
	}

	(void)svr_attr_def[(int)SRV_ATR_version].at_decode(
			&server.sv_attr[(int)SRV_ATR_version], 0, 0,
			PBS_VERSION);

	/* 6. open accounting file */

	if (acct_open(acct_file) != 0)
		return (-1);

	/* 7. Set up other server and global variables */

	if (a_opt != -1) {
		/* a_option was set, overrides saved value of scheduling attr */

		server.sv_attr[(int)SRV_ATR_scheduling].at_val.at_long = a_opt;
		server.sv_attr[(int)SRV_ATR_scheduling].at_flags |=
								ATR_VFLAG_SET;
	}

	/* Open and read in node list if one exists */
 

	if ((rc = setup_nodes()) == -1) {
		/* log_buffer set in setup_nodes */
		log_err(-1, "pbsd_init(setup_nodes)", log_buffer);
		return (-1);
	}
 

	/*
	 * 8. If not a "create" initialization, recover queues.
	 *    If a create, remove any queues that might be there.
	 */

	if (chdir(path_queues) != 0) {
		(void)sprintf(log_buffer, msg_init_chdir, path_queues);
		log_err(errno,"pbsd_init",log_buffer);
		return (-1);
	}

	had = server.sv_qs.sv_numque;
	server.sv_qs.sv_numque = 0;
	dir = opendir(".");
	if (dir == (DIR *)0) {
		log_err(-1, "pbsd_init", msg_init_noqueues);
		return (-1);
	}
	while ((pdirent = readdir(dir)) != (struct dirent *)0) {
	    if ( chk_save_file(pdirent->d_name) == 0) {

		/* recover queue */

		if ((pque = que_recov(pdirent->d_name))!=(pbs_queue *)0) {
			/* que_recov increments sv_numque */
			sprintf(log_buffer, msg_init_recovque,
			        pque->qu_qs.qu_name);
		    	log_event(PBSEVENT_SYSTEM|PBSEVENT_ADMIN |
				  PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER,
				  msg_daemonname, log_buffer);
			if (pque->qu_attr[(int)QE_ATR_ResourceAssn].at_flags &
					ATR_VFLAG_SET) {
			     que_attr_def[(int)QE_ATR_ResourceAssn].at_free(
				    &pque->qu_attr[(int)QE_ATR_ResourceAssn]);  
			}
		}
	    }
	}
	(void)closedir(dir);
		
	if ((had != server.sv_qs.sv_numque) && (type != RECOV_CREATE))
	    	logtype = PBSEVENT_ERROR | PBSEVENT_SYSTEM;
	else
	    	logtype = PBSEVENT_SYSTEM;
	sprintf(log_buffer,msg_init_expctq,had,server.sv_qs.sv_numque);
	log_event(logtype, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buffer);

	/*
	 * 9. If not "create" or "clean" recovery, recover the jobs.
	 *    If a a create or clean recovery, delete any jobs.
	 */


	if (chdir(path_jobs) != 0) {
		(void)sprintf(log_buffer, msg_init_chdir, path_jobs);
		log_err(errno, "pbsd_init", log_buffer);
		return (-1);
	}
		
	had = server.sv_qs.sv_numjobs;
	server.sv_qs.sv_numjobs = 0;
	dir = opendir(".");
	if (dir == (DIR *)0) {
		if ((type != RECOV_CREATE) && (type != RECOV_COLD)) {
			if (had == 0) {
				log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER,
					  msg_daemonname, msg_init_nojobs);
			} else {
				sprintf(log_buffer, msg_init_exptjobs, had, 0);
				log_err(-1, "pbsd_init", log_buffer);
			}
		}
	} else {
		/* Now, for each job found ... */
	
		while ((pdirent = readdir(dir)) != (struct dirent *)0) {
	
		   if (chk_save_file(pdirent->d_name) == 0) {

			    /* recover the jobs */

			    baselen = strlen(pdirent->d_name) - job_suf_len;
			    psuffix = pdirent->d_name + baselen;
			    if (strcmp(psuffix, job_suffix))
				continue;
			    if ((pjob=job_recov(pdirent->d_name)) != NULL) {
				pbsd_init_job(pjob, type);
				if ((type != RECOV_COLD) &&
				    (type != RECOV_CREATE) &&
				    (pjob->ji_qs.ji_svrflags&JOB_SVFLG_SCRIPT)){
				    (void)strcpy(basen, pdirent->d_name);
				    psuffix = basen + baselen;
				    (void)strcpy(psuffix, JOB_SCRIPT_SUFFIX);
				    if (chk_save_file(basen) != 0) {
					log_event(PBSEVENT_ERROR  |
						  PBSEVENT_SYSTEM |
						  PBSEVENT_ADMIN  |
						  PBSEVENT_JOB    |
						  PBSEVENT_FORCE, 
						  PBS_EVENTCLASS_JOB,
						  pjob->ji_qs.ji_jobid,
						  msg_script_open);
					init_abt_job(pjob);
				    }
				}
			    } else {
				(void)sprintf(log_buffer, msg_init_badjob,
					      pdirent->d_name);
				log_err(-1, "pbsd_init", log_buffer);
				(void)strcpy(basen, pdirent->d_name);
				psuffix = basen + baselen;
				(void)strcpy(psuffix, JOB_BAD_SUFFIX);
				(void)link(pdirent->d_name, basen);
				(void)unlink(pdirent->d_name);
			    }
		    }
		}
		(void)closedir(dir);
		if ((had != server.sv_qs.sv_numjobs) && 
		    (type != RECOV_CREATE) &&
		    (type != RECOV_COLD))
			logtype = PBSEVENT_ERROR | PBSEVENT_SYSTEM;
		else
			logtype = PBSEVENT_SYSTEM;
		sprintf(log_buffer, msg_init_exptjobs, had,
			server.sv_qs.sv_numjobs);
		log_event(logtype, PBS_EVENTCLASS_SERVER,
			  msg_daemonname, log_buffer);
	}

	/* If queue_rank has gone negative, renumber all jobs and reset rank */

	if (queue_rank < 0) {
		queue_rank = 0;
		pjob = (job *)GET_NEXT(svr_alljobs);
		while (pjob) {
			pjob->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long = 
						++queue_rank;
			(void)job_save(pjob, SAVEJOB_FULL);
			pjob = (job *)GET_NEXT(pjob->ji_alljobs);
		}
	}

	/* Put us back in the Server's Private directory */

	if (chdir(path_priv) != 0 ) {
		(void)sprintf(log_buffer, msg_init_chdir, path_priv);
		log_err(-1, id, log_buffer);
		return (3);
	}

	/* 10. Open and read in tracking records */

	fd = open(path_track, O_RDONLY | O_CREAT, 0600);
	if (fd < 0) {
		log_err(errno, "pbsd_init", "unable to open tracking file");
		return (-1);
	}
#if !defined(DEBUG) && !defined(NO_SECURITY_CHECK)
	if (chk_file_sec(path_track,  0, 0, S_IWGRP|S_IWOTH, 0) != 0)
		return (-1);
#endif  /* not DEBUG and not NO_SECURITY_CHECK */

	if (fstat(fd, &statbuf) < 0) {
		log_err(errno, "pbs_init", "unable to stat tracking file");
		return (-1);
	} else {

		i = (statbuf.st_size + sizeof (struct tracking) - 1) /
		    sizeof (struct tracking);
		if (i < PBS_TRACK_MINSIZE)
			server.sv_tracksize = PBS_TRACK_MINSIZE;
		else 
			server.sv_tracksize = i;
	}
	server.sv_track = (struct tracking *)calloc(server.sv_tracksize,
						    sizeof(struct tracking));
	for (i = 0; i < server.sv_tracksize; i++)
		(server.sv_track + i)->tk_mtime = 0;

	(void)read(fd, (char *)server.sv_track,
		   server.sv_tracksize * sizeof (struct tracking));
	(void)close(fd);
	server.sv_trackmodifed = 0;

	/* set work task to periodically save the tracking records */

	(void)set_task(WORK_Timed, (long)(time_now + PBS_SAVE_TRACK_TM),
		       track_save, 0);

	return (0);
}

/*
 * build_path - build the pathname for a PBS file or directory
 *
 *	Returns a newly allocated string holding "parent/name" followed by
 *	the optional suffix.  The separating slash is added only when parent
 *	is non-empty and does not already end in one; an empty parent yields
 *	just the name and suffix.  The caller owns the returned string.
 *
 *	If the space cannot be allocated, the error is logged, the log is
 *	closed and the server exits with status 3; the function never
 *	returns a null pointer.
 */

static char *build_path(parent, name, sufix)
	char *parent;		/* parent directory name (dirname) */
	char *name;		/* sub directory name */
	char *sufix;		/* suffix string to append, may be null */
{
	int    prefixslash;
	char  *ppath;
	size_t parentlen;
	size_t len;

	/*
	 * decide if a slash is needed between the names; the length is
	 * checked first so an empty parent is never indexed at [-1]
	 */

	parentlen = strlen(parent);
	if ((parentlen == 0) || (parent[parentlen - 1] == '/'))
		prefixslash = 0;
	else
		prefixslash = 1;

	/*
	 * allocate space for the names + maybe a slash between + the suffix
	 * + the terminating null
	 */

	len = parentlen + strlen(name) + prefixslash + 1;
	if (sufix)
		len += strlen(sufix);
	ppath = malloc(len);
	if (ppath == (char *)0) {
		log_err(errno, "build_path", msg_err_malloc);
		log_close(1);
		exit(3);
	}

	(void)strcpy(ppath, parent);
	if (prefixslash)
		(void)strcat(ppath, "/");
	(void)strcat(ppath, name);
	if (sufix) 
		(void)strcat(ppath, sufix);
	return (ppath);
}

/*
 * pbsd_init_job - decide what to do with the recovered job structure
 *
 *	The action depends on the type of initialization.
 */
static void pbsd_init_job(pjob, type)
	job *pjob;
	int  type;
{
	unsigned int d;

	pjob->ji_momhandle = -1;

	/* update at_server attribute in case name changed */

	job_attr_def[(int)JOB_ATR_at_server].at_free(
			&pjob->ji_wattr[(int)JOB_ATR_at_server]);
	job_attr_def[(int)JOB_ATR_at_server].at_decode(
			&pjob->ji_wattr[(int)JOB_ATR_at_server],
			(char *)0, (char *)0, server_name);

	/* update queue_rank if this job is higher than current */

	if ((unsigned long)pjob->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long >
					(unsigned long)queue_rank)
		queue_rank = pjob->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long;


	/* now based on the initialization type */

	if ((type == RECOV_COLD) || (type == RECOV_CREATE)) {
		need_y_response(type);
		init_abt_job(pjob);
	
	} else {

		if (type != RECOV_HOT)
			pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_HOTSTART;

		switch (pjob->ji_qs.ji_substate) {

		    case JOB_SUBSTATE_TRANSICM:
			if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) {

				/*
				 * This server created the job, so client 
				 * was qsub (a transient client), it won't be
				 * arround to recommit, so auto-commit now
				 */

				pjob->ji_qs.ji_state = JOB_STATE_QUEUED;
				pjob->ji_qs.ji_substate = JOB_SUBSTATE_QUEUED;
				pbsd_init_reque(pjob, CHANGE_STATE);
			} else {
				/*
				 * another server is sending, append to new job
				 * list and wait for commit; need to clear 
				 * receiving sock number though
				 */
				pjob->ji_qs.ji_un.ji_newt.ji_fromsock = -1;
				append_link(&svr_newjobs,
					    &pjob->ji_alljobs, pjob);

			}
			break;
			
		    case JOB_SUBSTATE_TRNOUT:
			pjob->ji_qs.ji_state = JOB_STATE_QUEUED;
			pjob->ji_qs.ji_substate = JOB_SUBSTATE_QUEUED;
			/* requeue as queued */
			pbsd_init_reque(pjob, CHANGE_STATE);
			break;

		    case JOB_SUBSTATE_TRNOUTCM:
			/* requeue as is - rdy to cmt */
			pbsd_init_reque(pjob, KEEP_STATE);	

			/* resend rtc */
			set_task(WORK_Immed, 0, resume_net_move, (void *)pjob);
			break;
		
		    case JOB_SUBSTATE_QUEUED:
		    case JOB_SUBSTATE_PRESTAGEIN:
		    case JOB_SUBSTATE_STAGEIN:
		    case JOB_SUBSTATE_STAGECMP:
		    case JOB_SUBSTATE_STAGEFAIL:
		    case JOB_SUBSTATE_STAGEGO:
		    case JOB_SUBSTATE_HELD:
		    case JOB_SUBSTATE_SYNCHOLD:
		    case JOB_SUBSTATE_DEPNHOLD:
		    case JOB_SUBSTATE_WAITING:
		    case JOB_SUBSTATE_PRERUN:
			pbsd_init_reque(pjob, CHANGE_STATE);
			break;

		    case JOB_SUBSTATE_RUNNING:
			pbsd_init_reque(pjob, KEEP_STATE);
			set_resc_assigned(pjob, INCR);
			set_old_nodes(pjob);
			if (type == RECOV_HOT)
				pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HOTSTART;
			break;

			
		    case JOB_SUBSTATE_SYNCRES:

			/* clear all dependent job ready flags */

			depend_clrrdy(pjob);
			pbsd_init_reque(pjob, CHANGE_STATE);
			break;

		    case JOB_SUBSTATE_EXITING:
		    case JOB_SUBSTATE_STAGEOUT:
		    case JOB_SUBSTATE_STAGEDEL:
		    case JOB_SUBSTATE_EXITED:
		    case JOB_SUBSTATE_ABORT:
			set_task(WORK_Immed, 0, on_job_exit, (void *)pjob);
			pbsd_init_reque(pjob, KEEP_STATE);
			break;

		    case JOB_SUBSTATE_RERUN:
			if (pjob->ji_qs.ji_state == JOB_STATE_EXITING)
			    set_task(WORK_Immed, 0, on_job_rerun, (void *)pjob);
			pbsd_init_reque(pjob, KEEP_STATE);
			break;

		    case JOB_SUBSTATE_RERUN1:
		    case JOB_SUBSTATE_RERUN2:
			set_task(WORK_Immed, 0, on_job_rerun, (void *)pjob);
			pbsd_init_reque(pjob, KEEP_STATE);
			break;

		    default:
			(void)sprintf(log_buffer,
			       msg_init_unkstate, pjob->ji_qs.ji_substate);
			log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB,
				  pjob->ji_qs.ji_jobid, log_buffer);
			job_abt(pjob, log_buffer);
			break;
		}

		/* if job has IP address of Mom, it may have changed */
		/* reset based on hostname                           */
 
		if ((pjob->ji_qs.ji_un_type == JOB_UNION_TYPE_EXEC) &&
		    (pjob->ji_qs.ji_un.ji_exect.ji_momaddr != 0)) {
			if (pjob->ji_wattr[(int)JOB_ATR_exec_host].at_flags &
				    ATR_VFLAG_SET) { 
				pjob->ji_qs.ji_un.ji_exect.ji_momaddr = get_hostaddr(parse_servername(pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str,&d));
			} else {
				pjob->ji_qs.ji_un.ji_exect.ji_momaddr = 0;
			}
		}
	}
}

			
/*
 * pbsd_init_reque - place a recovered job back into its queue
 *
 *	When change_state is non-zero the job's state and substate are
 *	re-evaluated and set before it is enqueued; otherwise only the
 *	state character is refreshed.  A job that cannot be enqueued is
 *	logged and aborted.
 */
static void pbsd_init_reque(pjob, change_state)
	job *pjob;
	int  change_state;
{
	char msg[265];
	int  state;
	int  substate;

	/* start the log message while the recovered substate is intact */

	(void)sprintf(msg, msg_init_substate, pjob->ji_qs.ji_substate);

	if (!change_state) {
		set_statechar(pjob);
	} else {
		/* update the state, typically to some form of QUEUED */
		svr_evaljobstate(pjob, &state, &substate, 0);
		(void)svr_setjobstate(pjob, state, substate);
	}

	/* re-enqueue the job into the queue it was in */

	if (svr_enquejob(pjob) != 0) {

		/* Oops, this should never happen */

		sprintf(msg, "%s; job %s queue %s", msg_err_noqueue,
			pjob->ji_qs.ji_jobid, pjob->ji_qs.ji_queue);
		log_err(-1, "pbsd_init", msg);
		(void)job_abt(pjob, msg);
		return;
	}

	(void)strcat(msg, msg_init_queued);
	(void)strcat(msg, pjob->ji_qs.ji_queue);
	log_event(PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG,
		  PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, msg);
}

/*
 * catch_child() - the signal handler for  SIGCHLD.
 *
 * Collect the status of every child that has exited and record it in the
 * event work task waiting on that child: the task is changed from
 * WORK_Deferred_Child to WORK_Deferred_Cmp, the wait status is saved in
 * wt_aux, and svr_delay_entry is incremented to indicate to next_task()
 * to check for it.
 *
 * errno is saved on entry and restored on exit so that the waitpid()
 * calls made here cannot disturb the errno seen by the interrupted code.
 */

static void catch_child(sig)
	int sig;
{
	struct work_task *ptask;
	pid_t		  pid;
	int		  statloc;
	int		  saved_errno = errno;
	
	while (1) {
		pid = waitpid(-1, &statloc, WNOHANG);
		if (pid == 0)
			break;		/* no (more) children have exited */
		if (pid == -1) {
			if (errno == EINTR)
				continue;	/* interrupted, try again */
			break;		/* no children at all, or error */
		}

		/* mark the task(s) waiting on this child as complete */

		ptask = (struct work_task *)GET_NEXT(task_list_event);
		while (ptask) {
			if ((ptask->wt_type == WORK_Deferred_Child) &&
			    (ptask->wt_event == pid)) {
				ptask->wt_type = WORK_Deferred_Cmp;
				ptask->wt_aux = (int)statloc;	/* exit status */
				svr_delay_entry++;	/* see next_task() */
			}
			ptask = (struct work_task *)GET_NEXT(ptask->wt_linkall);
		}
	}

	errno = saved_errno;
}

/*
 * change_logs - signal handler for SIGHUP
 *	Causes the accounting file and log file to be closed and reopened.
 *	Thus the old one can be renamed.
 *	Each call also toggles the RPP debug print flag, rpp_dbprt.
 *
 *	NOTE(review): the close/open routines are called directly from the
 *	signal handler; confirm they are safe to run in signal context.
 */

static void change_logs(sig)
	int sig;
{
	/* close both files first, then reopen under the same names */
	acct_close();
	log_close(1);
	log_open(log_file, path_log);
	(void)acct_open(acct_file);
	rpp_dbprt = 1 - rpp_dbprt;	/* toggle debug prints for RPP */
}

/*
 * stop_me - signal handler for all caught signals which terminate the server
 *
 *	Installed by pbsd_init() for SIGINT and SIGTERM, plus SIGQUIT when
 *	NDEBUG is defined and SIGSHUTDN where that signal exists.
 *
 *	Sets the server state attribute to SV_STATE_SHUTSIG to indicate we
 *	should shut down.  The signal number itself is not recorded; nothing
 *	is logged from inside the handler.
 */

/*ARGSUSED*/
static void stop_me(sig)
	int sig;
{
	server.sv_attr[(int)SRV_ATR_State].at_val.at_long = SV_STATE_SHUTSIG;
}

/*
 * chk_save_file - check that a name refers to a regular file
 *
 *	Returns:  0 if filename exists and is a regular file,
 *		 -1 if it exists but is some other type of file,
 *		 the errno value from stat() if it cannot be examined.
 */
static int chk_save_file(filename)
	char *filename;
{
	struct stat info;
	int	    result;

	if (stat(filename, &info) == -1)
		result = errno;
	else if (S_ISREG(info.st_mode))
		result = 0;
	else
		result = -1;

	return (result);
}

/*
 * resume_net_move - call net_move() to complete the routing of a job
 *	This is invoked via a work task created on recovery of a job
 *	in JOB_SUBSTATE_TRNOUTCM state.
 */

static void resume_net_move(ptask)
	struct work_task *ptask;
{
	net_move((job *)ptask->wt_parm1, 0);
}

/*
 * need_y_response - on create/clean initialization that would delete
 *	information, obtain the operator approval first.
 *
 *	The operator is prompted on standard output and must answer on
 *	standard input.  'y' or 'Y' approves and is remembered, so later
 *	calls return without prompting again.  'n', 'N', an empty line or
 *	end of file aborts the initialization by exiting with status 0.
 *	Any other answer repeats the question.
 */

static void need_y_response(type)
	int type;
{
	static int reply = -2;	/* first character of the last answer */
	int   ch;
	char *mode;
	char *victim;

	if (reply > 0)
		return;		/* already gotten a response */

	fflush(stdin);

	/* what kind of start this is, and what it is about to destroy */

	if (type == RECOV_CREATE) {
		mode   = "Create";
		victim = "server database";
	} else {
		mode   = "Cold";
		victim = "jobs";
	}
	printf(msg_startup3, msg_daemonname, server_name, mode, victim);

	for (;;) {

		/* the first character is the answer, discard rest of line */

		reply = getchar();
		for (ch = reply; (ch != '\n') && (ch != EOF); ch = getchar())
			;

		if ((reply == 'y') || (reply == 'Y'))
			return;

		if ((reply == EOF) || (reply == '\n') ||
		    (reply == 'n') || (reply == 'N')) {
			printf("PBS server %s initialization aborted\n", server_name);
			exit(0);
		}

		printf("y(es) or n(o) please:\n");
	}
}

/*
 * rm_files - on an RECOV_CREATE, remove the files in the specified
 *	directory (path_priv).  Sub directories are not removed; only the
 *	ones named in the byebye list below are descended into, and the
 *	files in them are removed in the same way.
 *
 *	A file that cannot be unlinked is logged and skipped, as is any
 *	entry whose full path name would not fit in the local buffer.
 */

static void rm_files(dirname)
	char *dirname;
{
	DIR *dir;
	int  i;
	size_t dirlen;
	struct stat    stb;
	struct dirent *pdirt;
	char path[1024];

	/* list of directories in which files are removed */
	static char *byebye[] = {
		"acl_groups",
		"acl_hosts",
		"acl_svr",
		"acl_users",
		"queues",
		(char *)0		/* keep as last entry */
	};

	dir = opendir(dirname);
	if (dir == (DIR *)0)
		return;

	dirlen = strlen(dirname);
	while ((pdirt = readdir(dir)) != (struct dirent *)0) {

		/* directory + '/' + entry + null must fit in path */

		if (dirlen + 1 + strlen(pdirt->d_name) >= sizeof(path)) {
			log_err(ENAMETOOLONG, "rm_files",
				"path name too long, entry skipped");
			continue;
		}
		(void)strcpy(path, dirname);
		(void)strcat(path, "/");
		(void)strcat(path, pdirt->d_name);

		if (stat(path, &stb) != 0)
			continue;

		if (S_ISDIR(stb.st_mode)) {
			for (i=0; byebye[i]; ++i) {
				if (strcmp(pdirt->d_name, byebye[i]) == 0) {
					rm_files(path);
					break;	/* names in list are unique */
				}
			}
		} else if (unlink(path) == -1) {
			(void)strcpy(log_buffer,"cant unlink");
			(void)strcat(log_buffer, path);
			log_err(errno, "pbsd_init", log_buffer);
		}
	}

	/* release the directory stream; this function recurses */

	(void)closedir(dir);
}

/*
 * init_abt_job() - abort a job during initialization: log the abort
 *	message against the job, mail the same message to the job owner,
 *	then purge the job.
 */

static void init_abt_job(pjob)
	job *pjob;
{
	char *jobid = pjob->ji_qs.ji_jobid;

	log_event(PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG,
		  PBS_EVENTCLASS_JOB, jobid, msg_init_abt);

	svr_mailowner(pjob, MAIL_ABORT, MAIL_NORMAL, msg_init_abt);

	/* pjob must not be used once it has been purged */
	job_purge(pjob);
}
