/*
*         Portable Batch System (PBS) Software License
* 
* Copyright (c) 1999, MRJ Technology Solutions.
* All rights reserved.
* 
* Acknowledgment: The Portable Batch System Software was originally developed
* as a joint project between the Numerical Aerospace Simulation (NAS) Systems
* Division of NASA Ames Research Center and the National Energy Research
* Supercomputer Center (NERSC) of Lawrence Livermore National Laboratory.
* 
* Redistribution of the Portable Batch System Software and use in source
* and binary forms, with or without modification, are permitted provided
* that the following conditions are met:
* 
* - Redistributions of source code must retain the above copyright and
*   acknowledgment notices, this list of conditions and the following
*   disclaimer.
* 
* - Redistributions in binary form must reproduce the above copyright and 
*   acknowledgment notices, this list of conditions and the following
*   disclaimer in the documentation and/or other materials provided with the
*   distribution.
* 
* - All advertising materials mentioning features or use of this software must
*   display the following acknowledgment:
* 
*   This product includes software developed by NASA Ames Research Center,
*   Lawrence Livermore National Laboratory, and MRJ Technology Solutions.
* 
*         DISCLAIMER OF WARRANTY
* 
* THIS SOFTWARE IS PROVIDED BY MRJ TECHNOLOGY SOLUTIONS ("MRJ") "AS IS" WITHOUT 
* WARRANTY OF ANY KIND, AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT ARE EXPRESSLY DISCLAIMED.
* 
* IN NO EVENT, UNLESS REQUIRED BY APPLICABLE LAW, SHALL MRJ, NASA, NOR
* THE U.S. GOVERNMENT BE LIABLE FOR ANY DIRECT DAMAGES WHATSOEVER,
* NOR ANY INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* 
* This license will be governed by the laws of the Commonwealth of Virginia,
* without reference to its choice of law rules.
*/
#include <pbs_config.h>   /* the master config generated by configure */

#include <sys/types.h>
#include <stdio.h>
#include <dirent.h>
#include <limits.h>
#include <assert.h>
#include <ctype.h>
#include <sys/stat.h>
#include "dis.h"
#include "libpbs.h"
#include "portability.h"
#include <errno.h>
#include <fcntl.h>
#include <pwd.h>
#include <signal.h>
#include <string.h>
#include "list_link.h"
#include "server_limits.h"
#include "attribute.h"
#include "resource.h"
#include "job.h"
#include "log.h"
#include "work_task.h"
#include "credential.h"
#include "batch_request.h"
#include "net_connect.h"
#include "svrfunc.h"
#include "mom_mach.h"
#include "mom_func.h"
#include "pbs_error.h"

/* Version identification string built from RCS keywords. */
static char ident[] = "@(#) $RCSfile: catch_child.c,v $ $Revision: 2.7 $";

/* External Functions */

/* External Globals */

/* epilogue script path handed to run_pelog() by scan_for_exiting() */
extern char		*path_epilog;
/* directory scanned for job files by init_abort_jobs() */
extern char		*path_jobs;
/* port used when the job's server attribute carries no ":port" part */
extern unsigned int	default_server_port;
/* head of the list of all jobs, walked with GET_NEXT() */
extern list_head	svr_alljobs;
/* non-zero while scan_for_exiting() still has work to do */
extern int		exiting_tasks;
/* used as the object name when logging a missing jobs directory */
extern char		*msg_daemonname;
/* set to 1 by catch_child() when SIGCHLD is delivered */
extern int		termin_child;
/* not referenced by the code in this file */
extern struct connection svr_conn[];
/* set to ATR_DFLAG_RDACC before encode_used() builds the obit */
extern int		resc_access_perm;
/* prefix of the "<path_home>/aux/<jobid>" file removed after the epilogue */
extern char		*path_home;

/* read handler registered with add_conn() for the Job Obit reply */
static void obit_reply A_((int sock));

/*
 * catch_child() - SIGCHLD signal handler.
 *
 *	The handler does no work of its own; it only raises the
 *	termin_child flag so that the fact that a child terminated
 *	is recorded.
 *
 *	sig - signal number delivered (not used).
 */

void catch_child(sig)
	int sig;
{
	(void)sig;		/* value is irrelevant, only the event matters */
	termin_child = 1;
}

/*
 * get_node() - look up the host entry for a virtual node of a job.
 *
 *	Searches the job's ji_vnods array (ji_numvnod entries) for the
 *	entry whose vn_node equals nodeid.
 *
 *	Returns the vn_host pointer of the first match, or NULL when no
 *	entry matches.  Callers must be prepared for the NULL result.
 */
hnodent	*
get_node(pjob, nodeid)
    job		*pjob;
    tm_node_id	nodeid;
{
	int	idx = 0;

	while (idx < pjob->ji_numvnod) {
		vnodent	*entry = &pjob->ji_vnods[idx];

		if (entry->vn_node == nodeid) {
			return entry->vn_host;
		}
		idx++;
	}

	/* nodeid is not one of this job's virtual nodes */
	return NULL;
}

#if	MOM_CHECKPOINT == 1
/*
**	Restart each task which has exited and has TI_FLAGS_CHKPT turned on.
**	If all tasks have been restarted, turn off MOM_CHKPT_POST.
*/
void
chkpt_partial(pjob)
	job	*pjob;
{
	static char	id[] = "chkpt_partial";
	int		i;
	char		namebuf[MAXPATHLEN];
	char		*filnam;
	task		*ptask;
	int		texit = 0;
	extern	char	task_fmt[];
	extern	char	*path_checkpoint;

	assert(pjob != NULL);

	strcpy(namebuf, path_checkpoint);
	strcat(namebuf, pjob->ji_qs.ji_fileprefix);
	strcat(namebuf, JOB_CKPT_SUFFIX);

	i = strlen(namebuf);
	filnam = &namebuf[i];

	for (ptask = (task *)GET_NEXT(pjob->ji_tasks);
			ptask != NULL;
			ptask = (task *)GET_NEXT(ptask->ti_jobtask)) {
		/*
		** See if the task was marked as one of those that did
		** actually checkpoint.
		*/
		if ((ptask->ti_flags & TI_FLAGS_CHKPT) == 0)
			continue;
		texit++;
		/*
		** Now see if it was reaped.  We don't want to
		** fool with it until we see it die.
		*/
		if (ptask->ti_qs.ti_status != TI_STATE_EXITED)
			continue;
		texit--;

		sprintf(filnam, task_fmt, ptask->ti_qs.ti_task);
		if (mach_restart(ptask, namebuf) == -1)
			goto fail;

		ptask->ti_qs.ti_status = TI_STATE_RUNNING;
		ptask->ti_flags &= ~TI_FLAGS_CHKPT;
		(void)task_save(ptask);
	}

	if (texit == 0) {
		char	oldname[MAXPATHLEN];
		struct	stat	statbuf;

		/*
		** All tasks should now be running.
		** Turn off MOM_CHKPT_POST flag so job is back to where
		** it was before the bad checkpoint attempt.
		*/
		pjob->ji_flags &= ~MOM_CHKPT_POST;
		/*
		** Get rid of incomplete checkpoint directory and
		** move old chkpt dir back to regular if it exists.
		*/
		*filnam = '\0';
		(void)remtree(namebuf);
		strcpy(oldname, namebuf);
		strcat(oldname, ".old");
		if (stat(oldname, &statbuf) == 0) {
			if (rename(oldname, namebuf) == -1)
				pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHKPT;
		}
	}
	return;

 fail:
	sprintf(log_buffer, "%s failed to restart",
		pjob->ji_qs.ji_jobid);
	log_err(errno, id, log_buffer);
	pjob->ji_flags &= ~MOM_CHKPT_POST;
	(void)kill_job(pjob, SIGKILL);
	return;
}
#endif	/* MOM_CHECKPOINT */

/*
 * scan_for_exiting() - walk the list of all jobs and deal with tasks and
 *	jobs that have finished.
 *
 *	For every job which has its cookie attribute set:
 *	 + each task in TI_STATE_EXITED has its waiting obit requests
 *	   answered (to a local task over its ti_fd, or to another MOM
 *	   over that node's stream) and is then saved as TI_STATE_DEAD;
 *	   if the task has no parent task, its exit status becomes the
 *	   job's exit status and IM_KILL_JOB is sent to the sisters.
 *	 + a job in JOB_SUBSTATE_EXITING without JOB_SVFLG_HERE (a sister)
 *	   is purged, either after reporting its resource usage to Mother
 *	   Superior or right away when there is no stream to her.
 *	 + a job in JOB_SUBSTATE_EXITING with JOB_SVFLG_HERE is killed, a
 *	   connection to the server is opened with obit_reply() as its
 *	   read handler, and a child is forked which runs the epilogue
 *	   and sends the Job Obit request.
 *
 *	No more than two obit children are started per call.  exiting_tasks
 *	is cleared only when the whole job list was walked; when the server
 *	cannot be reached the function returns early and leaves it set so
 *	the obit is retried.
 */
void scan_for_exiting()
{
	static	char		noconnect[] =
	    "No contact with server at hostaddr %x, port %d, jobid %s errno %d";
	pid_t			cpid;
	int			found_one = 0;
	int			dup_errno;
	job			*nxjob;
	job			*pjob;
	task			*ptask;
	obitent			*pobit;
	struct	batch_request	*preq;
	int			sock;
	int			sock3;
	char			*svrport;
	char			*cookie;
	unsigned	int	port;
	u_long	gettime		A_((resource *pres));
	u_long	getsize		A_((resource *pres));
	task	*task_find	A_((	job		*pjob,
					tm_task_id	taskid));
	int	im_compose	A_((	int		stream,
					char		*jobid,
					char		*cookie,
					int		command,
					tm_event_t	event,
					tm_task_id	taskid));
	/*
	** Look through the jobs.  Each one has its tasks examined
	** and if the job is EXITING, it meets its fate depending
	** on whether this is the Mother Superior or not.
	** The next job is fetched first because the current one
	** may be purged while it is being processed.
	*/
	for (pjob = (job *)GET_NEXT(svr_alljobs); pjob; pjob = nxjob) {
		nxjob = (job *)GET_NEXT(pjob->ji_alljobs);

#if	MOM_CHECKPOINT == 1
		/*
		** If a checkpoint with aborts is active,
		** skip it.  We don't want to report any obits
		** until we know that the whole thing worked.
		*/
		if (pjob->ji_flags & MOM_CHKPT_ACTIVE) {
			continue;
		}
		/*
		** If the job has had an error doing a checkpoint with
		** abort, the MOM_CHKPT_POST flag will be on.
		*/
		if (pjob->ji_flags & MOM_CHKPT_POST) {
			chkpt_partial(pjob);
			continue;
		}
#endif	/* MOM_CHECKPOINT */

		/* jobs without a cookie are not handled here */
		if (!(pjob->ji_wattr[(int)JOB_ATR_Cookie].at_flags &
					ATR_VFLAG_SET))
			continue;
		cookie = pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str;

		/*
		** Check each EXITED task.  They transition to DEAD here.
		*/
		for (ptask = (task *)GET_NEXT(pjob->ji_tasks);
				ptask != NULL;
				ptask = (task *)GET_NEXT(ptask->ti_jobtask)) {
			if (ptask->ti_qs.ti_status != TI_STATE_EXITED)
				continue;
			/*
			** Check if it is the top shell.
			*/
			if (ptask->ti_qs.ti_parenttask == TM_NULL_TASK) {
				pjob->ji_qs.ji_un.ji_momt.ji_exitstat =
					ptask->ti_qs.ti_exitstat;
				LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
					pjob->ji_qs.ji_jobid, "Terminated");
				if (send_sisters(pjob, IM_KILL_JOB) == 0) {
					pjob->ji_qs.ji_substate =
						JOB_SUBSTATE_EXITING;
				}
			}
			/*
			** Go through any TM client obits waiting.
			*/
			pobit = (obitent *)GET_NEXT(ptask->ti_obits);
			while (pobit) {
				hnodent	*pnode;

				pnode = get_node(pjob, pobit->oe_info.fe_node);

				if (pnode == NULL) {
					/*
					** get_node() found no such node in
					** this job; there is nobody to
					** answer, so just drop the request.
					*/
					LOG_EVENT(PBSEVENT_ERROR,
						PBS_EVENTCLASS_JOB,
						pjob->ji_qs.ji_jobid,
						"obit request from unknown node dropped");
				}
				/* see if this is me or another MOM */
				else if (pjob->ji_nodeid == pnode->hn_node) {
					task		*tp;

					/*
					** Send event to local kid.
					*/
					tp = task_find(pjob,
						pobit->oe_info.fe_taskid);
					assert(tp != NULL);
					if (tp->ti_fd != -1) {
					    (void)tm_reply(tp->ti_fd,
						IM_ALL_OKAY,
						pobit->oe_info.fe_event);
					    (void)diswsi(tp->ti_fd,
						ptask->ti_qs.ti_exitstat);
					    (void)DIS_tcp_wflush(tp->ti_fd);
					}
				}
				else if (pnode->hn_stream != -1) {
					/*
					** Send a response over to MOM
					** whose brat sent the request.
					*/
					(void)im_compose(pnode->hn_stream,
						pjob->ji_qs.ji_jobid,
						cookie, IM_ALL_OKAY,
						pobit->oe_info.fe_event,
						pobit->oe_info.fe_taskid);
					(void)diswsi(pnode->hn_stream,
						ptask->ti_qs.ti_exitstat);
					(void)rpp_flush(pnode->hn_stream);
				}

				/* the request is done with, answered or not */
				delete_link(&pobit->oe_next);
				free(pobit);
				pobit = (obitent *)GET_NEXT(ptask->ti_obits);
			}
			ptask->ti_fd = -1;
			ptask->ti_qs.ti_status = TI_STATE_DEAD;
			task_save(ptask);
		}

		/*
		** Look to see if the job has terminated.  If it is
		** in any state other than EXITING continue on.
		*/
		if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_EXITING)
			continue;

		/*
		** Look to see if I am a regular sister.  If so,
		** check to see if there is a obit event to
		** send back to mother superior.
		** Otherwise, I need to wait for her to send a KILL_JOB
		** so I can send the obit (unless she died).
		*/
		if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) == 0) {
			int	stream = (pjob->ji_hosts == NULL) ? -1 :
					pjob->ji_hosts[0].hn_stream;

			/*
			** Check to see if I'm still in touch with
			** the head office.  If not, I'm just going to
			** get rid of this job.
			*/
			if (stream == -1) {
				(void)kill_job(pjob, SIGKILL);
				job_purge(pjob);
				continue;
			}

			/*
			** No event waiting for sending info to MS
			** so I'll just sit tight.
			*/
			if (pjob->ji_obit == TM_NULL_EVENT)
				continue;

			/*
			** Check to see if any tasks are running.
			*/
			ptask = (task *)GET_NEXT(pjob->ji_tasks);
			while (ptask != NULL) {
				if (ptask->ti_qs.ti_status == TI_STATE_RUNNING)
					break;
				ptask = (task *)GET_NEXT(ptask->ti_jobtask);
			}
			/*
			** Still somebody there so don't send it yet.
			*/
			if (ptask != NULL)
				continue;
			/*
			** No tasks running ... format and send a
			** reply to the mother superior and get rid of
			** the job.
			*/
			(void)im_compose(stream, pjob->ji_qs.ji_jobid,
					cookie, IM_ALL_OKAY,
					pjob->ji_obit, TM_NULL_TASK);
			(void)diswul(stream,
					resc_used(pjob, "cput", gettime));
			(void)diswul(stream,
					resc_used(pjob, "mem", getsize));
			(void)rpp_flush(stream);
			job_purge(pjob);
			continue;
		}

		/*
		** At this point, we know we are Mother Superior for this
		** job which is EXITING.  Time for it to die.
		*/
		pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_Suspend;
		(void)kill_job(pjob, SIGKILL);
		delete_link(&pjob->ji_jobque);	/* unlink for poll list */

		/*
		 * +  Open connection to the Server (for the Job Obituary)
		 * +  Set the connection to call obit_reply when the reply 
		 *    arrives.
		 * +  fork child process, parent looks for more terminated jobs.
		 * Child:
		 * +  Run the epilogue script (if one)
		 * +  Send the Job Obit Request (notice).
		 */

		/* the server attribute may carry an explicit ":port" */
		svrport = strchr(pjob->ji_wattr[(int)JOB_ATR_at_server].
				at_val.at_str, (int)':');
		if (svrport)
			port = atoi(svrport+1); 
		else
			port = default_server_port;
		sock = client_to_svr(pjob->ji_qs.ji_un.ji_momt.ji_svraddr,
				port, 1);
		if (sock < 0) {
			(void)sprintf(log_buffer, noconnect,
					pjob->ji_qs.ji_un.ji_momt.ji_svraddr,
					port,
					pjob->ji_qs.ji_jobid, errno);
			LOG_EVENT(PBSEVENT_DEBUG,PBS_EVENTCLASS_REQUEST,
				  "jobobit", log_buffer);
			/*
			 * return (break out of loop), leave exiting_tasks set
			 * so Mom will retry Obit when server is available
			 */
			return;
		} else if (sock < 3) {
			/* needs to be 3 or above for epilogue */
			sock3 = fcntl(sock, F_DUPFD, 3);
			dup_errno = errno;	/* close() may change errno */
			(void)close(sock);
			if (sock3 < 0) {
				/*
				 * No usable descriptor; do not hand -1 to
				 * add_conn().  As above, return with
				 * exiting_tasks still set so the obit is
				 * retried.
				 */
				log_err(dup_errno, "scan_for_exiting",
					"unable to duplicate server socket");
				return;
			}
		} else
			sock3 = sock;
			
		pjob->ji_momhandle = sock3;
		add_conn(sock3, ToServerDIS,
			 pjob->ji_qs.ji_un.ji_momt.ji_svraddr,
			 port, obit_reply);

		cpid = fork_me(sock3);
		if (cpid > 0) {
			/* parent = mark that it is being sent */
			pjob->ji_qs.ji_substate = JOB_SUBSTATE_OBIT;
			if (found_one++ == 0) {
				continue;	/* look for one more */
			} else {
				break;	/* two at a time is our limit */
			}
		} else if (cpid < 0) {
			/*
			 * No child, so nothing will ever be sent on this
			 * connection.  Release it; the job stays in
			 * EXITING and a later pass opens a new one.
			 */
			close_conn(sock3);
			continue;
		}

		/* child: change to the user's home directory and 	*/
		/* run the epilogue script				*/

		(void)chdir(pjob->ji_grpcache->gc_homedir);

		if ((pjob->ji_wattr[(int)JOB_ATR_interactive].at_flags &
				ATR_VFLAG_SET) &&
		    pjob->ji_wattr[(int)JOB_ATR_interactive].at_val.at_long) {
			(void)run_pelog(PE_EPILOGUE, path_epilog, pjob,
					PE_IO_TYPE_NULL);
		} else {
			(void)run_pelog(PE_EPILOGUE, path_epilog, pjob,
					PE_IO_TYPE_STD);
		}

		/* Get rid of HOSTFILE if any */
		if (pjob->ji_flags & MOM_HAS_NODEFILE) {
			char	file[MAXPATHLEN+1];

			(void)sprintf(file, "%s/aux/%s",
				path_home, pjob->ji_qs.ji_jobid);
			(void)unlink(file);
			pjob->ji_flags &= ~MOM_HAS_NODEFILE;
		}
		

		/* Send the Job Obituary Notice to the server */

		preq = alloc_br(PBS_BATCH_JobObit);
		(void)strcpy(preq->rq_ind.rq_jobobit.rq_jid,
			     pjob->ji_qs.ji_jobid);
		preq->rq_ind.rq_jobobit.rq_status = 
			     pjob->ji_qs.ji_un.ji_momt.ji_exitstat;
		CLEAR_HEAD(preq->rq_ind.rq_jobobit.rq_attr);

		resc_access_perm = ATR_DFLAG_RDACC;
		encode_used(pjob, &preq->rq_ind.rq_jobobit.rq_attr);

		DIS_tcp_setup(sock3);
		(void)encode_DIS_ReqHdr(sock3,PBS_BATCH_JobObit,
					pbs_current_user);
		(void)encode_DIS_JobObit(sock3, preq);
		(void)encode_DIS_ReqExtend(sock3, 0);
		(void)DIS_tcp_wflush(sock3);
		(void)close(sock3);
		log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
			   pjob->ji_qs.ji_jobid, "Obit sent");
		exit(0);
	}
	/* pjob is non-null here only when the loop was left by "break" */
	if (pjob == 0) exiting_tasks = 0; /* went through all jobs */
}

/*
 * obit_reply - read and process the reply from the server acknowledging
 *	the job obituary notice.
 *
 *	sock - connection the reply arrives on; it is matched against the
 *	       ji_momhandle of the jobs in JOB_SUBSTATE_OBIT to find the
 *	       job the reply belongs to.
 *
 *	Depending on the reply code the job is marked EXITED, discarded,
 *	or put back into EXITING (with exiting_tasks set) so that the
 *	obit is sent again.  The connection is always shut down and
 *	closed before returning.
 */

static void obit_reply(sock)
	int sock;
{
	int			irtn;
	job			*nxjob;
	job			*pjob;
	attribute		*pattr;
	struct batch_request	*preq;
	int			 x;	/* dummy */

	/* read and decode the reply */

	preq = alloc_br(PBS_BATCH_JobObit);
	CLEAR_HEAD(preq->rq_ind.rq_jobobit.rq_attr);

	/*
	 * Retry only while the read itself was interrupted.  errno is
	 * cleared before each attempt: otherwise an EINTR left over from
	 * some earlier call would make a failure that does not set errno
	 * look like an interruption and the loop would never end.
	 */
	do {
		errno = 0;
		irtn = DIS_reply_read(sock, &preq->rq_reply);
	} while ((irtn != 0) && (errno == EINTR));

	if ( irtn != 0 ) {
		(void)sprintf(log_buffer,
			"DIS_reply_read failed, rc=%d sock=%d",
			irtn, sock);
		log_err(errno, "obit_reply", log_buffer);
		/* handled by "case -1" below: the obit is sent again */
		preq->rq_reply.brp_code = -1;
	}
	 
	/* find the job associated with the reply by the socket number	*/
	/* saved in the job structure, ji_momhandle 			*/

	pjob = (job *)GET_NEXT(svr_alljobs);
	while (pjob) {
	    nxjob = (job *)GET_NEXT(pjob->ji_alljobs);
	    if ( (pjob->ji_qs.ji_substate == JOB_SUBSTATE_OBIT) &&
		 (pjob->ji_momhandle == sock) ) {

		    switch (preq->rq_reply.brp_code) {

			case PBSE_NONE:
			    /* normal ack, mark job as exited	*/
			    pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITED;
			    job_save(pjob, SAVEJOB_QUICK);
			    break;

			case PBSE_ALRDYEXIT:
			    /* have already told the server before recovery */
			    /* the server will contact us to continue       */
			    pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITED;
			    job_save(pjob, SAVEJOB_QUICK);
			    break;

			case PBSE_CLEANEDOUT:
			    /* all jobs discarded by server, discard job */
			    pattr = &pjob->ji_wattr[(int)JOB_ATR_interactive];
			    if ( ((pattr->at_flags & ATR_VFLAG_SET) == 0) ||
				 (pattr->at_val.at_long == 0) ) {
				    /* do this if not interactive */
			            (void)unlink(std_file_name(pjob,StdOut,&x));
			            (void)unlink(std_file_name(pjob,StdErr,&x));
			            (void)unlink(std_file_name(pjob,Chkpt,&x));
			    }
			    mom_deljob(pjob);
			    break;

			case -1:
			    /* reply could not be read, send the obit again */
			    pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;
			    exiting_tasks = 1;
			    break;
			default:
			    LOG_EVENT(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,
					pjob->ji_qs.ji_jobid,
					"Server rejected Job Obit");
			    mom_deljob(pjob);
			    break;
		    }
		    /* leave the loop with pjob non-null: job was found */
		    break;
	    }
	    pjob = nxjob;
	}
	if (pjob == 0) {
		LOG_EVENT(PBSEVENT_ERROR,PBS_EVENTCLASS_REQUEST, "obit reply",
			  "Job not found for obit reply");
	}
	free_br(preq);
	shutdown(sock, 2);
	close_conn(sock);
}


/*
 * init_abort_jobs - on mom initialization, recover all running jobs.
 *
 *	Called on initialization
 *	   If the -p option was given (recover = 2), Mom will allow the jobs
 *	   to continue to run.   She depends on detecting when they terminate
 *	   via the slow poll method rather than SIGCHLD.
 *
 *	   If the -r option was given (recover = 1), MOM is recovering on a
 *  	   running system and the session id of the jobs should be valid;
 *	   the jobs are killed.
 *
 *	   If -r was not given (recover = 0), it is assumed that the whole 
 *	   system, not just MOM, is comming up, the session ids are not valid;
 *	   so no attempt is made to kill the job processes.  But the jobs are
 *	   terminated and requeued.
 */

void init_abort_jobs(recover)
	int	 recover;
{
	DIR		*dir;
	int		i, sisters;
	struct dirent	*pdirent;
	job		*pj;
	char		*job_suffix = JOB_FILE_SUFFIX;
	int		job_suf_len = strlen(job_suffix);
	char		*psuffix;
#if	MOM_CHECKPOINT == 1
	char		path[MAXPATHLEN+1];
	char		oldp[MAXPATHLEN+1];
	struct	stat	statbuf;
	extern	char	*path_checkpoint;
#endif

	dir = opendir(path_jobs);
	if (dir == (DIR *)0) {
		log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_SERVER, 
				msg_daemonname, "Jobs directory not found");
		exit(1);
	}
	while ((pdirent = readdir(dir)) != (struct dirent *)0) {
		if ((i = strlen(pdirent->d_name)) <= job_suf_len)
			continue;

		psuffix = pdirent->d_name + i - job_suf_len;
		if (strcmp(psuffix, job_suffix))
			continue;
		pj = job_recov(pdirent->d_name);
		if (pj == NULL)
			continue;

		append_link(&svr_alljobs, &pj->ji_alljobs, pj);
		job_nodes(pj);
		task_recov(pj);

#if	MOM_CHECKPOINT == 1
		/*
		** Check to see if a checkpoint.old dir exists.
		** If so, remove the regular checkpoint dir
		** and rename the old to the regular name.
		*/
		strcpy(path, path_checkpoint);
		strcat(path, pj->ji_qs.ji_fileprefix);
		strcat(path, JOB_CKPT_SUFFIX);
		strcpy(oldp, path);
		strcat(oldp, ".old");

		if (stat(oldp, &statbuf) == 0) {
			(void)remtree(path);
			if (rename(oldp, path) == -1)
				(void)remtree(oldp);
		}
#endif
		if ( (recover != 2) &&
		     ((pj->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING) ||
		      (pj->ji_qs.ji_substate == JOB_SUBSTATE_SUSPEND) ||
		      (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITING)) )  {

			if (recover)
				(void)kill_job(pj, SIGKILL);

			/*
			** Check to see if I am Mother Superior.  The
			** JOB_SVFLG_HERE flag is overloaded for MOM
			** for this purpose.
			** If I'm an ordinary sister, just throw the job
			** away.  If I am MS, send a KILL_JOB request to
			** any sisters that happen to still be alive.
			*/
			if ((pj->ji_qs.ji_svrflags & JOB_SVFLG_HERE) == 0) {
				mom_deljob(pj);
				continue;
			}

			/* set exit status to:
			 *   JOB_EXEC_INITABT - init abort and no chkpnt
			 *   JOB_EXEC_INITRST - init and chkpt, no mig
			 *   JOB_EXEC_INITRMG - init and chkpt, migrate
			 * to indicate recovery abort
			 */
			if (pj->ji_qs.ji_svrflags & 
					(JOB_SVFLG_CHKPT |
					 JOB_SVFLG_ChkptMig)) {
#if PBS_CHKPT_MIGRATE
				pj->ji_qs.ji_un.ji_momt.ji_exitstat = 
							JOB_EXEC_INITRMG;
#else
				pj->ji_qs.ji_un.ji_momt.ji_exitstat = 
							JOB_EXEC_INITRST;
#endif
			} else {
				pj->ji_qs.ji_un.ji_momt.ji_exitstat =
						JOB_EXEC_INITABT;
			}

			sisters = pj->ji_numnodes - 1;
			/*
			** A sisterhood exists... send a KILL request.
			*/
			if (sisters > 0) {
				pj->ji_resources = (noderes *)calloc(sisters,
						sizeof(noderes));
				(void)send_sisters(pj, IM_KILL_JOB);
				continue;
			}
			pj->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;
			job_save(pj, SAVEJOB_QUICK);
			exiting_tasks = 1;
		}
	}
	(void)closedir(dir);
}

/*
 * mom_deljob - forget about a job.
 *
 *	The job entry is handed to job_purge(); on Cray builds rmtmpdir()
 *	is called with the job id first.  The caller must not use pjob
 *	afterwards.
 */

void mom_deljob(pjob)
	job *pjob;
{
#ifdef _CRAY
	/* temporary directories of the job go first */
	rmtmpdir(pjob->ji_qs.ji_jobid);
#endif	/* _CRAY */

	job_purge(pjob);
}
