/*
*         Portable Batch System (PBS) Software License
* 
* Copyright (c) 1999, MRJ Technology Solutions.
* All rights reserved.
* 
* Acknowledgment: The Portable Batch System Software was originally developed
* as a joint project between the Numerical Aerospace Simulation (NAS) Systems
* Division of NASA Ames Research Center and the National Energy Research
* Supercomputer Center (NERSC) of Lawrence Livermore National Laboratory.
* 
* Redistribution of the Portable Batch System Software and use in source
* and binary forms, with or without modification, are permitted provided
* that the following conditions are met:
* 
* - Redistributions of source code must retain the above copyright and
*   acknowledgment notices, this list of conditions and the following
*   disclaimer.
* 
* - Redistributions in binary form must reproduce the above copyright and 
*   acknowledgment notices, this list of conditions and the following
*   disclaimer in the documentation and/or other materials provided with the
*   distribution.
* 
* - All advertising materials mentioning features or use of this software must
*   display the following acknowledgment:
* 
*   This product includes software developed by NASA Ames Research Center,
*   Lawrence Livermore National Laboratory, and MRJ Technology Solutions.
* 
*         DISCLAIMER OF WARRANTY
* 
* THIS SOFTWARE IS PROVIDED BY MRJ TECHNOLOGY SOLUTIONS ("MRJ") "AS IS" WITHOUT 
* WARRANTY OF ANY KIND, AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT ARE EXPRESSLY DISCLAIMED.
* 
* IN NO EVENT, UNLESS REQUIRED BY APPLICABLE LAW, SHALL MRJ, NASA, NOR
* THE U.S. GOVERNMENT BE LIABLE FOR ANY DIRECT DAMAGES WHATSOEVER,
* NOR ANY INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* 
* This license will be governed by the laws of the Commonwealth of Virginia,
* without reference to its choice of law rules.
*/
#include <pbs_config.h>   /* the master config generated by configure */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <pwd.h>
#include <grp.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
#include <signal.h>
#include <termios.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <netinet/in.h>
#if IBM_SP2==2	/* IBM SP with PSSP 3.1 */
#include <st_client.h>
#endif				/* IBM SP */

#include "libpbs.h"
#include "portability.h"
#include "list_link.h"
#include "server_limits.h"
#include "attribute.h"
#include "resource.h"
#include "job.h"
#include "log.h"
#include "rpp.h"
#include "mom_mach.h"
#include "mom_func.h"
#include "pbs_error.h"
#include "svrfunc.h"
#include "md5.h"

static char ident[] = "@(#) $RCSfile: start_exec.c,v $ $Revision: 2.20.2.1 $";

/*
 * Head-room used by finish_exec() when sizing the job's environment:
 * EXTRA_VARIABLE_SPACE is the number of bytes added to the size of the
 * job's variable strings when the string block is allocated, and
 * EXTRA_ENV_PTRS is the number of spare slots added to the pointer array.
 */
#define EXTRA_VARIABLE_SPACE 2000
#define EXTRA_ENV_PTRS	       32

/* Global Variables */


extern  int		num_var_env;
extern	char	      **environ;
extern	int		exiting_tasks;
extern	int		lockfds;
extern	list_head	mom_polljobs;
extern	char		*path_checkpoint;
extern	char		*path_jobs;
extern	char		*path_prolog;
extern	char		*path_spool;
extern	char		*path_home;
extern	gid_t		 pbsgroup;
extern	time_t		time_now;
extern	unsigned int	pbs_rm_port;
extern	u_long		localaddr;

int              mom_reader_go;		/* see catchinter() & mom_writer() */
struct var_table vtable;		/* for building up Job's environ */

/* Local Variables */

static int	 script_in;	/* script file, will be stdin	  */
static pid_t	 writerpid;	/* writer side of interactive job */
static pid_t	 shellpid;	/* shell part of interactive job  */


/*
 * Names of the environment variables whose values are computed by MOM when
 * the job is started.
 *
 * finish_exec() refers to these entries by numeric index (variables_else[0]
 * for HOME through variables_else[11] for PBS_NODEFILE), so the order of
 * this table must stay in step with that code.
 */
static	char *variables_else[] = {	/* variables to add, value computed */
	"HOME",
	"LOGNAME",
	"PBS_JOBNAME",
	"PBS_JOBID",
	"PBS_QUEUE",
	"SHELL",
	"USER",
	"PBS_JOBCOOKIE",
	"PBS_NODENUM",
	"PBS_TASKNUM",
	"PBS_MOMPORT",
	"PBS_NODEFILE"
};

/* NOTE(review): not referenced in the visible part of this file */
static	char *variables_env[NUM_LCL_ENV_VAR];

/* number of entries in variables_else[], used to size the env pointer array */
static	int num_var_else = sizeof(variables_else) / sizeof(char *);

static	void starter_return A_((int upfds, int downfds, int code, struct startjob_rtn *));
static	void catchinter A_((int));

/*
 * FDMOVE(fd) - make sure descriptor "fd" does not occupy 0, 1 or 2.
 *
 *	If fd is below 3 it is duplicated with fcntl(F_DUPFD, 3), i.e. onto
 *	the lowest free descriptor numbered 3 or higher, the original is
 *	closed and the duplicate is stored back into fd.
 *
 *	Caveats:
 *	- fd is evaluated several times and assigned to, so it must be a
 *	  plain modifiable variable without side effects.
 *	- if fcntl() fails, fd is left as -1 and the original descriptor has
 *	  still been closed; callers that care must test fd afterwards.
 *	- the macro expands to a bare "if { }" statement, not do/while(0);
 *	  at least one call site omits the trailing semicolon, so the shape
 *	  must stay as it is.  Do not use it directly before an "else".
 */
#define FDMOVE(fd)      if (fd < 3) { \
	int     hold = fcntl(fd, F_DUPFD, 3); \
	(void)close(fd); \
	fd = hold; \
}

/*
 * no_hang() - SIGALRM handler armed around the attempt to connect to qsub
 *	for an interactive job.
 *
 *	qsub may be hung or suspended, or the network may be broken, and mom
 *	cannot afford to wait forever.  The handler only records that the
 *	alarm fired; the interrupted call is left to fail on its own.
 */

static void no_hang(sig)
	int sig;
{
	(void)sig;	/* the signal number itself is of no interest */

	LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_REQUEST, " ",
		  "alarm timed-out connect to qsub");
}

/*
 * check_pwd - look up the account under which a job is to be run.
 *
 *	Fills in the execution uid and gid in the job structure, allocates
 *	the job's group cache (home directory plus supplementary groups) and
 *	runs the site specific account check.
 *
 *	Returns: pointer to the password entry returned by getpwnam(), or
 *		 NULL with an explanatory message left in log_buffer.
 */
struct passwd *
check_pwd(pjob)
	job *pjob;
{
	struct passwd		*pw;
	struct group		*gr;
	char			*euser;

	/* the execution user must exist on this host */

	euser = pjob->ji_wattr[(int)JOB_ATR_euser].at_val.at_str;
	pw = getpwnam(euser);
	if (pw == NULL) {
		(void)sprintf(log_buffer, "No Password Entry for User %s",
			      euser);
		return NULL;
	}
	pjob->ji_qs.ji_un.ji_momt.ji_exuid = pw->pw_uid;

	/* group cache is allocated with room for the home directory string */

	pjob->ji_grpcache = malloc(sizeof (struct grpcache) +
				   strlen(pw->pw_dir) + 1);
	if (pjob->ji_grpcache == NULL) {
		(void)sprintf(log_buffer, "Malloc failed");
		return NULL;
	}
	(void)strcpy(pjob->ji_grpcache->gc_homedir, pw->pw_dir);

	/* pick the execution group, then load the supplementary groups */

	if ((pjob->ji_wattr[(int)JOB_ATR_egroup].at_flags &
	     (ATR_VFLAG_SET | ATR_VFLAG_DEFLT)) != ATR_VFLAG_SET) {

		/* nothing explicit was requested: use the login group */

		pjob->ji_qs.ji_un.ji_momt.ji_exgid = pw->pw_gid;
	} else {

		/* an execution group was named explicitly */

		gr = getgrnam(pjob->ji_wattr[(int)JOB_ATR_egroup].
			      at_val.at_str);
		if (gr == NULL) {
			(void)sprintf(log_buffer, "No Group Entry for Group %s",
			     pjob->ji_wattr[(int)JOB_ATR_egroup].at_val.at_str);
			return NULL;
		}
		pjob->ji_qs.ji_un.ji_momt.ji_exgid = gr->gr_gid;
	}

	pjob->ji_grpcache->gc_ngroup =
			init_groups(pw->pw_name,
				pjob->ji_qs.ji_un.ji_momt.ji_exgid,
				NGROUPS_MAX,
				pjob->ji_grpcache->gc_groups);
	if (pjob->ji_grpcache->gc_ngroup < 0) {
		(void)sprintf(log_buffer, "Too many group entires");
		return NULL;
	}

	/* finally let the site veto the account */

	if (site_mom_chkuser(pjob) != 0) {
		(void)sprintf(log_buffer, "site_mom_chkuser faild");
		return NULL;
	}

	return pw;
}

/*
 * mom_restart_job - restart the tasks of a job from its checkpoint files.
 *
 *	"path" names a directory; every entry whose name is longer than two
 *	characters (this skips "." and "..") is taken to be the checkpoint
 *	image of one task, with the file name being the task id.  Each task
 *	is restarted through mach_restart(), marked running and saved.
 *
 *	Returns: number of tasks restarted, or -1 on any error.
 *
 *	On failure errno is preserved across the logging and directory
 *	cleanup because the caller inspects it to choose between retrying
 *	and giving up.  A path that does not fit in the local buffer is
 *	reported as ENAMETOOLONG rather than being copied past the end.
 */
int mom_restart_job(pjob, path)
    job		*pjob;
    char	*path;
{
	static char	id[] = "mom_restart_job";
	size_t		dirlen;
	int		saved_errno;
	char		namebuf[MAXPATHLEN];
	char		*filnam;
	DIR		*dir;
	struct	dirent	*pdir;
	tm_task_id	taskid;
	task		*ptask;
	int		tcount = 0;
	long		mach_restart A_((task *, char *path));

	if ((dir = opendir(path)) == NULL) {
		sprintf(log_buffer, "opendir %s", path);
		log_err(errno, id, log_buffer);
		return -1;
	}

	/* need room for the directory, the '/' separator and the NUL */

	dirlen = strlen(path);
	if (dirlen + 2 > sizeof(namebuf)) {
		sprintf(log_buffer, "%s: checkpoint directory name too long",
			pjob->ji_qs.ji_jobid);
		errno = ENAMETOOLONG;
		goto fail;
	}
	(void)strcpy(namebuf, path);
	(void)strcat(namebuf, "/");
	dirlen++;
	filnam = &namebuf[dirlen];	/* file names are appended here */

	while ((pdir = readdir(dir)) != NULL) {
		if (strlen(pdir->d_name) <= 2)
			continue;

		if ((taskid = (tm_task_id)atoi(pdir->d_name)) == 0) {
			sprintf(log_buffer, "%s: garbled filename %s",
				pjob->ji_qs.ji_jobid, pdir->d_name);
			goto fail;
		}
		if ((ptask = task_find(pjob, taskid)) == NULL) {
			sprintf(log_buffer, "%s: task %ld not found",
				pjob->ji_qs.ji_jobid, taskid);
			goto fail;
		}

		/* the full file name must fit, including the NUL */

		if (dirlen + strlen(pdir->d_name) + 1 > sizeof(namebuf)) {
			sprintf(log_buffer, "%s: checkpoint file name too long",
				pjob->ji_qs.ji_jobid);
			errno = ENAMETOOLONG;
			goto fail;
		}
		(void)strcpy(filnam, pdir->d_name);
		if (mach_restart(ptask, namebuf) == -1) {
			sprintf(log_buffer, "%s: task %ld failed from file %s",
				pjob->ji_qs.ji_jobid, taskid, namebuf);
			goto fail;
		}

		ptask->ti_qs.ti_status = TI_STATE_RUNNING;
		(void)task_save(ptask);
		tcount++;
	}
	closedir(dir);
	return tcount;

 fail:
	/* keep errno intact for the caller; log_err/closedir may change it */
	saved_errno = errno;
	log_err(saved_errno, id, log_buffer);
	closedir(dir);
	errno = saved_errno;
	return -1;
}

/*
 * exec_bail - give up on starting a job.
 *
 *	Records "code" as the job's exit status, moves the job to the
 *	EXITING substate, raises the global exiting_tasks flag and closes
 *	the job's stdout/stderr descriptors when they are greater than zero.
 */
void
exec_bail(pjob, code)
	job	*pjob;
	int	code;
{
	pjob->ji_qs.ji_un.ji_momt.ji_exitstat = code;
	pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;
	exiting_tasks = 1;

	if (pjob->ji_stdout > 0) {
		(void)close(pjob->ji_stdout);
	}
	if (pjob->ji_stderr > 0) {
		(void)close(pjob->ji_stderr);
	}
}

#define	RETRY	3

/*
 * open_demux - open a TCP connection to the given address and port.
 *
 *	addr	IPv4 address, stored into sin_addr.s_addr unchanged
 *		(presumably already in network byte order -- confirm with
 *		the callers)
 *	port	port number in host byte order
 *
 *	Up to RETRY attempts are made when the failure looks transient
 *	(EINTR, EADDRINUSE, ETIMEDOUT, ECONNREFUSED), pausing two seconds
 *	between attempts.  A fresh socket is created for every attempt
 *	because the state of a socket after a failed connect() is
 *	unspecified.
 *
 *	Returns: the connected socket, or -1 after logging the error.
 */
int
open_demux(addr, port)
	u_long	addr;
	int	port;
{
	static	char	id[] = "open_demux";
	int	sock;
	int	i;
	int	err = 0;
	struct	sockaddr_in	remote;

	(void)memset(&remote, 0, sizeof(remote));
	remote.sin_addr.s_addr = addr;
	remote.sin_port = htons((unsigned short)port);
	remote.sin_family = AF_INET;

	for (i=0; i<RETRY; i++) {
		if ((sock = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
			err = errno;	/* save before anything can change it */
			sprintf(log_buffer, "%s: socket %s",
				id, netaddr(&remote));
			log_err(err, id, log_buffer);
			return -1;
		}

		if (connect(sock, (struct sockaddr *)&remote,
				sizeof(remote)) == 0)
			return sock;

		err = errno;
		(void)close(sock);	/* never reuse a socket that failed */

		if ((err != EINTR) && (err != EADDRINUSE) &&
		    (err != ETIMEDOUT) && (err != ECONNREFUSED))
			break;		/* not worth retrying */

		if (i + 1 < RETRY)	/* no point waiting after last try */
			sleep(2);
	}

	sprintf(log_buffer, "%s: connect %s", id, netaddr(&remote));
	log_err(err, id, log_buffer);
	return -1;
}

/*
 * open_pty - open the slave side of the master/slave pty pair.
 *
 *	The slave's name is read from the job's output path attribute.
 *	The descriptor is moved above 2, given mode 0620 and handed to the
 *	job's execution uid/gid.
 *
 *	Returns: the slave descriptor, or a negative value if it could not
 *		 be opened (the failure is logged here).
 */

static int open_pty(pjob)
	job	*pjob;
{
	char	*slave_name = pjob->ji_wattr[(int)JOB_ATR_outpath].at_val.at_str;
	int	 pts = open(slave_name, O_RDWR, 0600);

	if (pts < 0) {
		log_err(errno, "open_pty", "cannot open slave");
		return (pts);
	}

	/* keep the slave clear of stdin/stdout/stderr */

	FDMOVE(pts);

	(void)fchmod(pts, 0620);
	(void)fchown(pts, pjob->ji_qs.ji_un.ji_momt.ji_exuid,
			  pjob->ji_qs.ji_un.ji_momt.ji_exgid);
#ifdef _CRAY
	(void)ioctl(0, TCCLRCTTY, 0);
	(void)ioctl(pts, TCSETCTTY, 0); /* make controlling */
#endif	/* _CRAY */

	return (pts);
}
/*
 * is_joined - determine if standard out and standard error are joined
 *	together (-j option) and if so which is first
 *
 *	The join attribute counts as "joined" only when it names both
 *	streams: it starts with 'o' and also contains 'e', or it starts
 *	with 'e' and also contains 'o'.
 *
 *	Returns: 0 - no join, separate files
 *		+1 - joined as stdout
 *		-1 - joined as stderr
 */

int
is_joined(pjob)
	job *pjob;
{
	attribute *pattr;
	char	  *join;

	pattr = &pjob->ji_wattr[(int)JOB_ATR_join];
	if ((pattr->at_flags & ATR_VFLAG_SET) == 0)
		return 0;		/* attribute not set, no join */

	join = pattr->at_val.at_str;

	if ((join[0] == 'o') && (strchr(join, (int)'e') != NULL))
		return 1;		/* "oe": both go to stdout */

	/* the second stream to look for here is 'o', not 'e' again */
	if ((join[0] == 'e') && (strchr(join, (int)'o') != NULL))
		return -1;		/* "eo": both go to stderr */

	return 0;			/* "n" or anything else */
}


/* 
 * open_std_out_err - open the job's standard output and standard error
 *	files and install them as descriptors 1 and 2 of this process.
 *
 *	If the job asked for joined streams (see is_joined()) only the
 *	first-named file is opened and the other stream is a dup() of it.
 *
 *	Returns: 0 on success
 *		-1 on failure; the error is logged and any descriptor
 *		   opened here is closed again
 */

static int open_std_out_err(pjob)
	job *pjob;
{
	int	   joined;
	int	   file_out = -2;	/* -2 means "not opened yet" */
	int	   file_err = -2;
	int	   filemode = O_CREAT | O_WRONLY | O_APPEND;

	/* if std out/err joined (set and !="n"),which file is first */
	
	joined = is_joined(pjob);
	if (joined == 1) {
		file_out = open_std_file(pjob, StdOut, filemode,
				   	   pjob->ji_qs.ji_un.ji_momt.ji_exgid);
		file_err = dup(file_out);
	} else if (joined == -1) {
		file_err = open_std_file(pjob, StdErr, filemode,
			   	   pjob->ji_qs.ji_un.ji_momt.ji_exgid);
		file_out = dup(file_err);
	}

	if (file_out == -2)
	    file_out = open_std_file(pjob, StdOut, filemode,
				    pjob->ji_qs.ji_un.ji_momt.ji_exgid);
	if (file_err == -2)
	    file_err = open_std_file(pjob, StdErr, filemode,
				    pjob->ji_qs.ji_un.ji_momt.ji_exgid);
	if ((file_out < 0) || (file_err < 0))
		goto fail;

	FDMOVE(file_out);	/* make sure descriptor > 2       */
	FDMOVE(file_err);	/* so don't clobber stdin/out/err */

	/* FDMOVE leaves -1 behind when the descriptor could not be moved */

	if ((file_out < 0) || (file_err < 0))
		goto fail;

	/*
	 * dup2() lands on exactly 1 and 2; close() followed by dup() would
	 * pick the lowest free descriptor, which need not be the one wanted.
	 */

	if ((dup2(file_out, 1) == -1) || (dup2(file_err, 2) == -1))
		goto fail;

	(void)close(file_out);
	(void)close(file_err);
	return 0;

 fail:
	log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB,
		   pjob->ji_qs.ji_jobid,
		   "Unable to open standard output/error");
	if (file_out >= 0)
		(void)close(file_out);
	if (file_err >= 0)
		(void)close(file_err);
	return -1;
}

/*
** Used by MOM superior to start the shell process.
*/
void
finish_exec(pjob)
	job *pjob;
{
	static char 		*id = "finish_exec";
	struct sigaction	act;
	char			*arg[2];
	char	   		buf[MAXPATHLEN+2];
	pid_t      		cpid;
	struct passwd		*pwdp;		/* for uid, shell, home dir */
	int	   		i, j, vnodenum;
	int	   		is_interactive = 0;
	attribute		*pattr;
	attribute		*pattri;
	char	  		*phost;
	resource		*presc;
	resource_def		*prd;
	int	   		pipe_script[2];
	char			*ptc_name;
	int	   		ptc = -1;
	int	   		pts;
	int			qsub_sock;
	char			*shell;
	char			*shellname;
	int			jsmpipe[2];	/* job starter to MOM for sid */
	int			upfds;
	int			mjspipe[2];	/* MOM to job starter for ack */
	int			downfds;
	int			port_out, port_err;
	struct startjob_rtn	sjr;
	char			*termtype;
	task			*ptask;
	struct	array_strings	*vstrs;
	struct	stat		sb;
	struct	sockaddr_in	saddr;

	if ( pjob->ji_numnodes > 1 ) {
		/*
		** Get port numbers from file decriptors in job struct.  The
		** sockets are stored there so they can be closed later as
		** Main MOM will not need them after the job is going.
		*/
		i = sizeof(saddr);
		if (getsockname(pjob->ji_stdout,
				(struct sockaddr *)&saddr, &i) == -1) {
			(void)sprintf(log_buffer, "getsockname on stdout");
			exec_bail(pjob, JOB_EXEC_RETRY);
			return;
		}
		port_out = (int)ntohs(saddr.sin_port);
	
		i = sizeof(saddr);
		if (getsockname(pjob->ji_stderr,
				(struct sockaddr *)&saddr, &i) == -1) {
			(void)sprintf(log_buffer, "getsockname on stderr");
			exec_bail(pjob, JOB_EXEC_RETRY);
			return;
		}
		port_err = (int)ntohs(saddr.sin_port);
	} else {
		port_out = -1;
		port_err = -1;
	}

	/* Did the job request nodes, will need to setup node file */

	pattr = &pjob->ji_wattr[(int)JOB_ATR_resource];
	prd = find_resc_def(svr_resc_def, "neednodes", svr_resc_size);
	presc = find_resc_entry(pattr, prd);
	if (presc != NULL) 
		pjob->ji_flags |= MOM_HAS_NODEFILE;

	/*
	 * get the password entry for the user under which the job is to be run
	 * we do this now to save a few things in the job structure
	 */
	if ((pwdp = check_pwd(pjob)) == NULL) {
		LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
			  pjob->ji_qs.ji_jobid, log_buffer);
		exec_bail(pjob, JOB_EXEC_FAIL1);
		return;
	}
#if IBM_SP2==2        /* IBM SP with PSSP 3.1 */

	/* load IBM SP switch table */
	if (load_sp_switch(pjob) != 0) {
		exec_bail(pjob, JOB_EXEC_RETRY);
		return;
	}
#endif				/* IBM SP */

	/*
	 * if certain resource limits require that the job usage be
	 * polled or it is a multinode job, we link the job to mom_polljobs.
	 *
	 * NOTE: we overload the job field ji_jobque for this as it
	 * is not used otherwise by MOM
	 */

	if ( pjob->ji_numnodes > 1 || mom_do_poll(pjob) )
		append_link(&mom_polljobs, &pjob->ji_jobque, pjob);

#if MOM_CHECKPOINT == 1
	/* Is the job to be periodic checkpointed */

	pattr = &pjob->ji_wattr[(int)JOB_ATR_chkpnt];
	if ( (pattr->at_flags & ATR_VFLAG_SET) &&
	     (*pattr->at_val.at_str == 'c') &&
	     (*(pattr->at_val.at_str+1) == '=') ) {
		/* has checkpoint time (in minutes), convert to seconds */
		pjob->ji_chkpttime = atoi(pattr->at_val.at_str+2) * 60;
		pjob->ji_chkptnext = pjob->ji_chkpttime;
	}

	/* If job has been checkpointed, restart from the checkpoint image */

	strcpy(buf, path_checkpoint);
	strcat(buf, pjob->ji_qs.ji_fileprefix);
	strcat(buf, JOB_CKPT_SUFFIX);
	if ( ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHKPT) || 
	      (pjob->ji_qs.ji_svrflags & JOB_SVFLG_ChkptMig)) &&
	     (stat(buf,&sb) == 0) ) {

		/* Checkpointed - restart from checkpoint file */

		/* perform any site required setup before restart */
		if ((i = site_mom_prerst(pjob)) != 0) {
		    pjob->ji_qs.ji_un.ji_momt.ji_exitstat = i;
		    pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;
		    exiting_tasks = 1;
		    (void)sprintf(log_buffer,"Pre-restart failed %d",errno);
		}

		if ((i = mom_restart_job(pjob, buf)) > 0) {
		    (void)sprintf(log_buffer, "Restarted %d tasks", i);

		    /* reset mtime so walltime will not include held time */
		    /* update to time now minus the time already used	  */
		    /* unless it is suspended, see request.c/req_signal() */

		    if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend) == 0) {
		    	pjob->ji_qs.ji_stime = time_now - 
				( sb.st_mtime - pjob->ji_qs.ji_stime);
			pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;
			if (mom_get_sample() != PBSE_NONE)
				(void)mom_set_use(pjob);
		    } else {
			pjob->ji_qs.ji_substate = JOB_SUBSTATE_SUSPEND;
		    }
		} else {	/* retry for any kind of changable thing */
		    if ((errno == EAGAIN) ||
#ifdef	ERFLOCK
			(errno == ERFLOCK) ||
#endif
#ifdef	EQUSR
			(errno == EQUSR) ||
#endif
#ifdef	EQGRP
			(errno == EQGRP) ||
#endif
#ifdef	EQACT
			(errno == EQACT) ||
#endif
#ifdef	ENOSDS
			(errno == ENOSDS) ||
#endif
			(errno == ENOMEM) ||
			(errno == ENOLCK) ||
			(errno == ENOSPC) ||
			(errno == ENFILE) ||
			(errno == EDEADLK) ||
			(errno == EBUSY))
		        pjob->ji_qs.ji_un.ji_momt.ji_exitstat = JOB_EXEC_RETRY;
		    else {
		        pjob->ji_qs.ji_un.ji_momt.ji_exitstat =
				JOB_EXEC_BADRESRT;
		    }
		    pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;
		    exiting_tasks = 1;
		    (void)sprintf(log_buffer, "Restart failed, error %d",errno);
		}
		return;
	}
#endif	/* MOM_CHECKPOINT */

	pattri = &pjob->ji_wattr[(int)JOB_ATR_interactive];
	if ( (pattri->at_flags & ATR_VFLAG_SET) &&
	     (pattri->at_val.at_long != 0) ) {

		is_interactive = 1;
		
		/*
		 * open a master pty, need to do it here before we fork,
		 * to save the slave name in the master's job structure
		 */

		if ((ptc = open_master(&ptc_name)) < 0) {
			log_err(errno, id ,"cannot open master pty");
			exec_bail(pjob, JOB_EXEC_RETRY);
			return;
		}
		FDMOVE(ptc)

		/* save pty name in job output/error file name */

		pattr = &pjob->ji_wattr[(int)JOB_ATR_outpath];
		job_attr_def[(int)JOB_ATR_outpath].at_free(pattr);
		(void)job_attr_def[(int)JOB_ATR_outpath].at_decode(
				pattr, (char *)0, (char *)0, ptc_name);
		pattr = &pjob->ji_wattr[(int)JOB_ATR_errpath];
		job_attr_def[(int)JOB_ATR_errpath].at_free(pattr);
		(void)job_attr_def[(int)JOB_ATR_errpath].at_decode(
				pattr, (char *)0, (char *)0, ptc_name);

#if SHELL_INVOKE == 1

	} else {
		/* need a pipe on which to write the shell script 	*/
		/* file name to the input of the shell			*/

		if (pipe(pipe_script) == -1) {
			(void)sprintf(log_buffer,
					"Failed to create shell name pipe");
			exec_bail(pjob, JOB_EXEC_RETRY);
			return;
		}
#endif /* SHELL_INVOKE */

	}

	/* create pipes between MOM and the job starter    */
	/* fork the job starter which will become the job */

	if ((pipe(mjspipe) == -1) || (pipe(jsmpipe) == -1)) {
		i = -1;

	} else {

		i = 0;

		/* make sure pipe file descriptors are above 2 */

		if (jsmpipe[1] < 3) {
			upfds = fcntl(jsmpipe[1], F_DUPFD, 3);
			(void)close(jsmpipe[1]);
		} else
			upfds = jsmpipe[1];
		if (mjspipe[0] < 3) {
			downfds = fcntl(mjspipe[0], F_DUPFD, 3);
			(void)close(mjspipe[0]);
		} else
			downfds = mjspipe[0];
	}
	if ( (i == -1) || (upfds < 3) || (downfds < 3) ) {
		(void)sprintf(log_buffer,"Failed to create communication pipe");
		exec_bail(pjob, JOB_EXEC_RETRY);
		return;
	}
	if ((ptask = task_create(pjob, TM_NULL_TASK)) == NULL) {
		(void)sprintf(log_buffer, "Task creation failed");
		exec_bail(pjob, JOB_EXEC_RETRY);
		return;
	}

	pjob->ji_qs.ji_stime = time_now;
	/*
	** Fork the child that will become the job.
	*/
	cpid = fork_me(-1);
	if (cpid > 0) {
		/* the parent side, still the main man, uhh that is MOM */

		(void)close(upfds);
		(void)close(downfds);
		if (ptc >= 0)
			(void)close(ptc);

		(void)strcpy(buf, path_jobs);
		(void)strcat(buf, pjob->ji_qs.ji_fileprefix);
		(void)strcat(buf, JOB_SCRIPT_SUFFIX);
		(void)chown(buf, pjob->ji_qs.ji_un.ji_momt.ji_exuid,
				 pjob->ji_qs.ji_un.ji_momt.ji_exgid);
#if SHELL_INVOKE == 1
		if (is_interactive == 0) {
			int k;

			/* pass name of shell script on pipe	*/
			/* will be stdin of shell 		*/
	
			(void)close(pipe_script[0]);
			(void)strcat(buf, "\n");	/* setup above */
			i = strlen(buf);
			j = 0;
			while (j < i) {
				if ((k=write(pipe_script[1], buf+j, i-j)) < 0) {
					if (errno == EINTR)
						continue;
					break;
				}
				j += k;
			}	
			(void)close(pipe_script[1]);
		}
#endif	/* SHELL_INVOKE */

		
		/* now we read the session id or error */
		for(;;) {
			i = read(jsmpipe[0], (char *)&sjr, sizeof(sjr));
			if (i == -1 && errno == EINTR)
				continue;
			break;
		}
		j = errno;
		(void)close(jsmpipe[0]);
		if (i != sizeof(sjr)) {
			sprintf(log_buffer,
				"read of pipe for pid job %s got %d not %d",
				pjob->ji_qs.ji_jobid,
				i, (int)sizeof(sjr));
			log_err(j, id, log_buffer);
			(void)sprintf(log_buffer, "start failed,improper sid");
			(void)close(mjspipe[1]);
			exec_bail(pjob, JOB_EXEC_RETRY);
			return;
		}
		/* send back as an acknowledgement that MOM got it */
		(void)write(mjspipe[1], &sjr, sizeof(sjr));
		(void)close(mjspipe[1]);
		DBPRT(("%s: read start return %d %d\n", id,
			sjr.sj_code, sjr.sj_session))
		if (sjr.sj_code < 0) {
			(void)sprintf(log_buffer, "job not started, %s %d",
				      (sjr.sj_code==JOB_EXEC_RETRY)?
					"Retry" : "Failure", sjr.sj_code);
			exec_bail(pjob, sjr.sj_code);
			return;
		}

		set_globid(pjob, &sjr);
		ptask->ti_qs.ti_sid = sjr.sj_session;
		ptask->ti_qs.ti_status = TI_STATE_RUNNING;
		strcpy(ptask->ti_qs.ti_parentjobid, pjob->ji_qs.ji_jobid);
		if (task_save(ptask) == -1) {
			(void)sprintf(log_buffer, "Task save failed");
			exec_bail(pjob, JOB_EXEC_RETRY);
			return;
		}


		if (pjob->ji_numnodes > 1) {
			/*
			** Put port numbers into job struct and close sockets.
			** The job uses them to talk to demux, but main MOM
			** doesn't need them.   The port numbers are stored 
			** here for use in start_process(), to connect to 
			** pbs_demux.
			*/
			(void)close(pjob->ji_stdout);
			pjob->ji_stdout = port_out;
			(void)close(pjob->ji_stderr);
			pjob->ji_stderr = port_err;
		}

		/* return from the starter indicated the job is a go ... */
		/* record the start time and session/process id		 */

		pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long
				= sjr.sj_session;
		pjob->ji_wattr[(int)JOB_ATR_session_id].at_flags =
					ATR_VFLAG_SET | ATR_VFLAG_MODIFY;
		pjob->ji_qs.ji_state = JOB_STATE_RUNNING;
		pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;
		job_save(pjob, SAVEJOB_QUICK);

		sprintf(log_buffer, "Started, pid = %d", sjr.sj_session);
		/* the message is "logged" in req_commit() */

		return;

	} else if ( cpid < 0) {
		(void)sprintf(log_buffer, "Fork failed in %s: %d\n",
			      id, errno);
		log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, 
			   pjob->ji_qs.ji_jobid, log_buffer);
		exec_bail(pjob, JOB_EXEC_RETRY);
		return;
	}

	/************************************************/
	/*						*/
	/* The child process - will become THE JOB	*/
	/*						*/
	/************************************************/

	(void)close(lockfds);
	(void)close(jsmpipe[0]);
	(void)close(mjspipe[1]);

	/*
	 * find which shell to use, one specified or the login shell
	 */

	shell = set_shell(pjob, pwdp);	/* in the machine dependent section */

	/*
	 * set up the Environmental Variables to be given to the job 
	 */

	vstrs = pjob->ji_wattr[(int)JOB_ATR_variables].at_val.at_arst;
	vtable.v_bsize = (vstrs->as_next - vstrs->as_buf) +EXTRA_VARIABLE_SPACE;
	vtable.v_block = malloc(vtable.v_bsize);
	vtable.v_ensize = vstrs->as_usedptr + num_var_else + num_var_env +
			  EXTRA_ENV_PTRS;
	vtable.v_used   = 0;
	vtable.v_envp = (char **)malloc(vtable.v_ensize * sizeof(char *));

	/*  First variables from the local environment */

	for (j = 0; j < num_var_env; ++j) 
		bld_env_variables(&vtable, environ[j], (char *)0);

	/* Second, the variables passed with the job.  They may */
	/* be overwritten with new correct values for this job	*/

	for (j = 0; j < vstrs->as_usedptr; ++j)
		bld_env_variables(&vtable, vstrs->as_string[j], (char *)0);

	/* .. Next the critical variables: home, path, logname, ... */
	/* these may replace some passed in with the job	    */

	/* HOME */
	bld_env_variables(&vtable, variables_else[0], pwdp->pw_dir); /* HOME */

	/* LOGNAME */
	bld_env_variables(&vtable, variables_else[1], pwdp->pw_name);

	/* PBS_JOBNAME */
	bld_env_variables(&vtable, variables_else[2], 
			  pjob->ji_wattr[(int)JOB_ATR_jobname].at_val.at_str);

	/* PBS_JOBID */
	bld_env_variables(&vtable, variables_else[3], pjob->ji_qs.ji_jobid);

	/* PBS_QUEUE */
	bld_env_variables(&vtable, variables_else[4],
			 pjob->ji_wattr[(int)JOB_ATR_in_queue].at_val.at_str);

	/* SHELL */
	bld_env_variables(&vtable, variables_else[5], shell);

	/* USER, for compatability */
	bld_env_variables(&vtable, variables_else[6], pwdp->pw_name);

	/* PBS_JOBCOOKIE */
	bld_env_variables(&vtable, variables_else[7],
		pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str);

	/* PBS_NODENUM */
	sprintf(buf, "%d", pjob->ji_nodeid);
	bld_env_variables(&vtable, variables_else[8], buf);

	/* PBS_TASKNUM */
	sprintf(buf, "%ld", ptask->ti_qs.ti_task);
	bld_env_variables(&vtable, variables_else[9], buf);

	/* PBS_MOMPORT */
	sprintf(buf, "%d", pbs_rm_port);
	bld_env_variables(&vtable, variables_else[10], buf);

	/* PBS_NODEFILE */
	vnodenum = pjob->ji_numvnod;
	if (pjob->ji_flags & MOM_HAS_NODEFILE) {
		FILE	*nhow;

		sprintf(buf, "%s/aux/%s", path_home, pjob->ji_qs.ji_jobid);
		bld_env_variables(&vtable, variables_else[11], buf);

		if ((nhow = fopen(buf, "w")) == NULL) {
			sprintf(log_buffer, "cannot open %s", buf);
			log_err(errno, id, log_buffer);
			starter_return(upfds, downfds, JOB_EXEC_FAIL1, &sjr);
		}
		/*
		**	The file must be owned by root and readable by
		**	the user.  We take the easy way out and make
		**	it readable by anyone.
		*/
		if (fchmod(fileno(nhow), 0644) == -1) {
			sprintf(log_buffer, "cannot chmod %s", buf);
			log_err(errno, id, log_buffer);
			fclose(nhow);
			starter_return(upfds, downfds, JOB_EXEC_FAIL1, &sjr);
		}
		for (j=0; j<vnodenum; j++) {
			vnodent	*vp = &pjob->ji_vnods[j];

			fprintf(nhow, "%s\n", vp->vn_host->hn_host);
		}
		fclose(nhow);
	}

	/* specific system related variables */

	j = set_mach_vars(pjob, &vtable);
	if (j != 0) {
		starter_return(upfds, downfds, j, &sjr);	/* exits */
	}
	
	umask(077);

	if (is_interactive) {
		struct	sigaction	act;

/*************************************************************************/
/*		We have an "interactive" job, connect the standard	 */
/*		streams to a socket connected to qsub.			 */
/*************************************************************************/

		sigemptyset(&act.sa_mask);
#ifdef SA_INTERRUPT
		act.sa_flags   = SA_INTERRUPT;
#else
		act.sa_flags   = 0;
#endif /* SA_INTERRUPT */
		act.sa_handler = no_hang;
		(void)sigaction(SIGALRM, &act, (struct sigaction *)0);
		alarm(30);


		/* Set environment to reflect interactive */

		bld_env_variables(&vtable,"PBS_ENVIRONMENT","PBS_INTERACTIVE");

		/* get host where qsub resides */

		phost = arst_string("PBS_O_HOST", &pjob->ji_wattr[(int)JOB_ATR_variables]);
		if ( (phost == (char *)0) ||
		     ((phost = strchr(phost, (int)'=')) == (char *)0) ) {
			log_err(-1, id, "PBS_O_HOST not set");
			starter_return(upfds, downfds, JOB_EXEC_FAIL1, &sjr);
                }
		
		qsub_sock = conn_qsub(phost+1, pattri->at_val.at_long);
		if (qsub_sock < 0) {
			log_err(errno, id ,"cannot open qsub sock");
			starter_return(upfds, downfds, JOB_EXEC_FAIL1, &sjr);
		}
		FDMOVE(qsub_sock);

		/* send job id as validation to qsub */

		if (write(qsub_sock, pjob->ji_qs.ji_jobid,PBS_MAXSVRJOBID+1) !=
							  PBS_MAXSVRJOBID+1) {
			log_err(errno, id ,"cannot write jobid");
			starter_return(upfds, downfds, JOB_EXEC_FAIL1, &sjr);
		}

		/* receive terminal type and window size */

		if ((termtype = rcvttype(qsub_sock)) == (char *)0) 
			starter_return(upfds, downfds, JOB_EXEC_FAIL1, &sjr);

		bld_env_variables(&vtable, termtype, (char *)0);
		*(vtable.v_envp + vtable.v_used) = (char *)0;	/* null term */
		if (rcvwinsize(qsub_sock) == -1)
			starter_return(upfds, downfds, JOB_EXEC_FAIL1, &sjr);

		/* turn off alarm set around qsub connect activities */

		alarm(0);
		act.sa_handler = SIG_DFL;
		act.sa_flags   = 0;
		(void)sigaction(SIGALRM, &act, (struct sigaction *)0);

		/* set up the Job session */

		j = set_job(pjob, &sjr);
		if (j < 0) {
#ifndef NDEBUG
			if (j == -1) {
			    /* set_job didn't leave message in log_buffer */
			    (void)strcpy(log_buffer,"Unable to set session");
			}
			log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB,
				   pjob->ji_qs.ji_jobid, log_buffer);
#endif	/* NDEBUG */
			starter_return(upfds, downfds, JOB_EXEC_FAIL1, &sjr); 
		}

		/* Open the slave pty as the controlling tty */
			
		if ((pts = open_pty(pjob)) < 0) {
			log_err(errno, id,"cannot open slave");
			starter_return(upfds, downfds, JOB_EXEC_FAIL1, &sjr);
		}

		act.sa_handler = SIG_IGN;	/* setup to ignore SIGTERM */ 

		writerpid = fork();
		if (writerpid == 0) {
			/* child is "writer" process */

			(void)sigaction( SIGTERM, &act, NULL);

			(void)close(upfds);
			(void)close(downfds);
			(void)close(pts);

			mom_writer(qsub_sock, ptc);

			shutdown(qsub_sock, 2);
			exit(0);

		} else if (writerpid > 0) {
			/*
			** parent -- it first runs the prolog then forks
			** again.  the child becomes the job while the
			** parent becomes the reader.
			*/

			(void)close(1);
			(void)close(2);
			(void)dup2(pts, 1);
			(void)dup2(pts, 2);
			fflush(stdout);
			fflush(stderr);
			set_termcc(pts);	/* set terminal control char */
			(void)setwinsize(pts);	/* set window size to qsub's */

			/* run prolog */

			if (run_pelog(PE_PROLOGUE,
					path_prolog, pjob,
					PE_IO_TYPE_ASIS) != 0) {
				(void)fprintf(stderr,
					"Could not run prolog: %s\n",
					log_buffer);
				starter_return(upfds, downfds, 
					       JOB_EXEC_FAIL2, &sjr);
			}

			shellpid = fork();
			if (shellpid == 0) {

				/*********************************************/
				/* child - this will be the interactive job  */
				/* i/o is to slave tty			     */
				/*********************************************/

				(void)close(0);
				(void)dup2(pts, 0);
				fflush(stdin);

				(void)close(ptc);	/* close master side */
				(void)close(pts);	/* dup'ed above */
				(void)close(qsub_sock);

				/* continue setting up and exec-ing shell */

			} else {
				if (shellpid > 0) {
					/* fork, parent is "reader" process  */
					(void)sigaction( SIGTERM, &act, NULL);

					(void)close(pts);
					(void)close(upfds);
					(void)close(downfds);
					(void)close(1);
					(void)close(2);

					sigemptyset(&act.sa_mask);
					act.sa_flags   = SA_NOCLDSTOP;
					act.sa_handler = catchinter;
					(void)sigaction( SIGCHLD, &act,
						(struct sigaction *)0);

					mom_reader_go = 1;
					mom_reader(qsub_sock, ptc);
				}
				else {
					log_err(errno,  id,
						"cant fork reader");
				}

				/* make sure qsub gets EOF */

				shutdown(qsub_sock, 2);

				/* change pty back to available after */
				/* job is done */
				(void)chmod(ptc_name, 0666);
				(void)chown(ptc_name, 0, 0);
				exit(0);
			}
		} else { /* error */
			log_err(errno, id ,"cannot fork nanny");

			/* change pty back to available */
			(void)chmod(ptc_name, 0666);
			(void)chown(ptc_name, 0, 0);

			starter_return(upfds, downfds, JOB_EXEC_RETRY, &sjr);
		}

	} else {

/*************************************************************************/
/*		We have a "normal" batch job, connect the standard	 */
/*		streams to files					 */
/*************************************************************************/

		/* set Environment to reflect batch */

		bld_env_variables(&vtable,"PBS_ENVIRONMENT", "PBS_BATCH");
		bld_env_variables(&vtable, "ENVIRONMENT", "BATCH");
		
	
#if SHELL_INVOKE == 1
		/* if passing script file name as input to shell */

		(void)close(pipe_script[1]);
		script_in = pipe_script[0];
#else	/* SHELL_INVOKE == 0 */
		/* if passing script itself as input to shell */

		(void)strcpy(buf, path_jobs);
		(void)strcat(buf, pjob->ji_qs.ji_fileprefix);
		(void)strcat(buf, JOB_SCRIPT_SUFFIX);
		if ((script_in = open(buf, O_RDONLY, 0)) < 0) {
			if (errno == ENOENT)
				script_in = open("/dev/null", O_RDONLY, 0);
		}
#endif	/* SHELL_INVOKE */
	
		if (script_in < 0) {
			log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB,
				   pjob->ji_qs.ji_jobid,
				   "Unable to open script");
			starter_return(upfds, downfds, JOB_EXEC_FAIL1, &sjr);
		}
		FDMOVE(script_in);	/* make sure descriptor > 2       */
		if (script_in != 0) {
		    (void)close(0);
		    (void)dup(script_in);
		    (void)close(script_in);
		}

		if (open_std_out_err(pjob) == -1) {
			starter_return(upfds, downfds, JOB_EXEC_FAIL1, &sjr);
		}

		/* run prolog */
		
		if ((j = run_pelog(PE_PROLOGUE,
				   path_prolog, pjob, PE_IO_TYPE_ASIS)) == 1) {
			/* abort job */
			(void)fprintf(stderr,
				"Could not run prolog: %s\n", log_buffer);
			starter_return(upfds, downfds, JOB_EXEC_FAIL2, &sjr);
		} else if (j != 0) {
			/* requeue job */
			starter_return(upfds, downfds, JOB_EXEC_RETRY, &sjr);
		}

		/* set up the Job session */

		j = set_job(pjob, &sjr);
		if (j < 0) {
			if (j == -1) {
			    /* set_job didn't leave message in log_buffer */
			    (void)strcpy(log_buffer,"Unable to set session");
			}
			/* set_job leaves message in log_buffer */
			(void)fprintf(stderr, "%s\n", log_buffer);

#ifndef NDEBUG
			log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB,
				   pjob->ji_qs.ji_jobid, log_buffer);
#endif	/* NDEBUG */
			starter_return(upfds, downfds, JOB_EXEC_FAIL2, &sjr);
		}

	}

/*************************************************************************/
/*	Set resource limits				 		 */
/*	Both normal batch and interactive job come through here 	 */
/*************************************************************************/

	pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long = sjr.sj_session;

	if (site_job_setup(pjob) != 0) {
		starter_return(upfds, downfds, JOB_EXEC_FAIL2, &sjr);/* exits */
	}

	if ( (i = mom_set_limits(pjob, SET_LIMIT_SET)) != PBSE_NONE ) {
		(void)sprintf(log_buffer, "Unable to set limits, err=%d", i);
		log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB,
			   pjob->ji_qs.ji_jobid, log_buffer);
		if (i == PBSE_RESCUNAV)	{	/* resource temp unavailable */
			if (is_interactive)
				j = JOB_EXEC_FAIL2;
			else
				j = JOB_EXEC_RETRY;
		}
		else
			j = JOB_EXEC_FAIL2;
		starter_return(upfds, downfds, j, &sjr);	 /* exits */
	}

	/* NULL terminate the envp array, This is MUST DO	*/
	*(vtable.v_envp + vtable.v_used) = (char *)0;

	endpwent();

	/*
	 * become the user and  execv the shell and become the real job 
	 */

	(void)setgroups(pjob->ji_grpcache->gc_ngroup,
			(gid_t *)pjob->ji_grpcache->gc_groups);
	(void)setgid(pjob->ji_qs.ji_un.ji_momt.ji_exgid);
	(void)setuid(pjob->ji_qs.ji_un.ji_momt.ji_exuid);
#ifdef _CRAY
	(void)seteuid(pjob->ji_qs.ji_un.ji_momt.ji_exuid); /* cray kludge */
#endif	/* CRAY */

	/*
	 * change working directory to User's Home
	 */

	if (chdir(pwdp->pw_dir) == -1) {
		log_record(PBSEVENT_JOB | PBSEVENT_SECURITY, PBS_EVENTCLASS_JOB,
			   pjob->ji_qs.ji_jobid,
			   "Could not chdir to Home directory");
		(void)fprintf(stderr, "Could not chdir to home directory\n");
		starter_return(upfds, downfds, JOB_EXEC_FAIL2, &sjr);
	}
	
	/* tell mom we are going */
	starter_return(upfds, downfds, JOB_EXEC_OK, &sjr);
	log_close(0);

	if ( (pjob->ji_numnodes == 1) ||
	    ((cpid = fork()) > 0) ) {	/* parent does the shell */
		/* close sockets that child uses */
		(void)close(pjob->ji_stdout);
		(void)close(pjob->ji_stderr);

		/* construct argv array */
		shellname = strrchr(shell, '/');
		if (shellname)
			++shellname;	/* go past last '/' */
		else
			shellname = shell;
		arg[0] = malloc(strlen(shellname) + 2);
		strcpy(arg[0], "-");
		strcat(arg[0], shellname);
		arg[1] = (char *)0;

#if 0		/* def DEBUG */
		for (i=3; i< 40; ++i) {	/* check for any extra descriptors */
			if (close(i) >= 0)
				fprintf(stderr, "Closed shell file %d\n", i);
		}
#endif	/* DEBUG */
		execve(shell, arg, vtable.v_envp);
	}
	else if (cpid == 0) {		/* child does demux */
		char	*demux = DEMUX;

		/* setup descriptors 3 and 4 */
		(void)dup2(pjob->ji_stdout, 3);
		if (pjob->ji_stdout > 3)
			close(pjob->ji_stdout);
		(void)dup2(pjob->ji_stderr, 4);
		if (pjob->ji_stderr > 4)
			close(pjob->ji_stderr);

		/* construct argv array */
		shellname = strrchr(demux, '/');
		if (shellname)
			++shellname;	/* go past last '/' */
		else
			shellname = shell;
		arg[0] = malloc(strlen(shellname) + 1);
		strcpy(arg[0], shellname);
		arg[1] = (char *)0;

#if 0		/* def DEBUG */
		for (i=5; i< 40; ++i) {	/* check for any extra descriptors */
			if (close(i) >= 0)
				fprintf(stderr, "Closed demux file %d\n", i);
		}
#endif	/* DEBUG */
		execve(demux, arg, vtable.v_envp);
		shell = demux;  /* for fprintf below */
	}
	fprintf(stderr, "pbs_mom, exec of %s failed with error %d\n",shell, errno);
	exit(254);	/* should never, ever get here */
}

/*
** Start a process for a spawn request.  This will be different from
** a job's initial shell task in that the environment will be specified
** and no interactive code need be included.
*/
int
start_process(ptask, argv, envp)
    task	*ptask;
    char	**argv;
    char	**envp;
{
	static	char	id[] = "start_process";
	job	*pjob = ptask->ti_job;
	char	buf[MAXPATHLEN+2];
	pid_t	pid;
	int	pipes[2], kid_read, kid_write, parent_read, parent_write;
	int	pts;
	int	i, j;
	int	fd;
	u_long	ipaddr;
	struct	array_strings	*vstrs;
	struct  startjob_rtn sjr;

	if (pipe(pipes) == -1)
		return -1;
	if (pipes[1] < 3) {
		kid_write = fcntl(pipes[1], F_DUPFD, 3);
		(void)close(pipes[1]);
	}
	else
		kid_write = pipes[1];
	parent_read = pipes[0];

	if (pipe(pipes) == -1)
		return -1;
	if (pipes[0] < 3) {
		kid_read = fcntl(pipes[0], F_DUPFD, 3);
		(void)close(pipes[0]);
	}
	else
		kid_read = pipes[0];
	parent_write = pipes[1];

	/*
	** Get ipaddr to Mother Superior.
	*/
	if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE)	/* I'm MS */
		ipaddr = htonl(localaddr);
	else {
		struct	sockaddr_in	*ap;

		/*
		** We always have a stream open to MS at node 0.
		*/
		i = pjob->ji_hosts[0].hn_stream;
		if ((ap = rpp_getaddr(i)) == NULL) {
			sprintf(log_buffer, "job %s has no stream to MS",
				pjob->ji_qs.ji_jobid);
			log_err(-1, id, log_buffer);
			return -1;
		}
		ipaddr = ap->sin_addr.s_addr;
	}

	/*
	** Begin a new process for the fledgling task.
	*/
	if ((pid = fork_me(-1)) == -1)
		return -1;
	else if (pid != 0) {		/* parent */
		(void)close(kid_read);
		(void)close(kid_write);

		/* read sid */
		for(;;) {
			i = read(parent_read, (char *)&sjr, sizeof(sjr));
			if (i == -1 && errno == EINTR)
				continue;
			break;
		}
		j = errno;
		(void)close(parent_read);
		if (i != sizeof(sjr)) {
			sprintf(log_buffer,
				"read of pipe for pid job %s got %d not %d",
				pjob->ji_qs.ji_jobid, i, (int)sizeof(sjr));
			log_err(j, id, log_buffer);
			(void)close(parent_write);
			return -1;
		}
		(void)write(parent_write, &sjr, sizeof(sjr));
		(void)close(parent_write);
		DBPRT(("%s: read start return %d %d\n", id,
			sjr.sj_code, sjr.sj_session))
		if (sjr.sj_code < 0) {
			(void)sprintf(log_buffer, "task not started, %s %s %d",
					(sjr.sj_code==JOB_EXEC_RETRY)?
					"Retry" : "Failure",
					argv[0],
					sjr.sj_code);
			log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB,
				pjob->ji_qs.ji_jobid, log_buffer);
			return -1;
		}
		set_globid(pjob, &sjr);
		ptask->ti_qs.ti_sid = sjr.sj_session;
		ptask->ti_qs.ti_status = TI_STATE_RUNNING;
		(void)task_save(ptask);
		if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING) {
			pjob->ji_qs.ji_state = JOB_STATE_RUNNING;
			pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;
			job_save(pjob, SAVEJOB_QUICK);
		}
		(void)sprintf(log_buffer, "task started, %s", argv[0]);
		log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
			pjob->ji_qs.ji_jobid, log_buffer);
		return 0;
	}
	/************************************************/
	/* The child process - will become the TASK	*/
	/************************************************/
	(void)close(lockfds);
	(void)close(parent_read);
	(void)close(parent_write);

	/*
	 * set up the Environmental Variables to be given to the job 
	 */

	vstrs = pjob->ji_wattr[(int)JOB_ATR_variables].at_val.at_arst;
	vtable.v_bsize = (vstrs->as_next - vstrs->as_buf) +EXTRA_VARIABLE_SPACE;
	vtable.v_block = malloc(vtable.v_bsize);
	vtable.v_ensize = vstrs->as_usedptr + num_var_else + num_var_env +
			  EXTRA_ENV_PTRS;
	vtable.v_used   = 0;
	vtable.v_envp = (char **)malloc(vtable.v_ensize * sizeof(char *));
	
	/* First variables from the local environment */
	for (j = 0; j < num_var_env; ++j) 
		bld_env_variables(&vtable, environ[j], (char *)0);

	/* Next, the variables passed with the job.  They may   */
	/* be overwritten with new correct values for this job	*/

	for (j = 0; j < vstrs->as_usedptr; ++j)
		bld_env_variables(&vtable, vstrs->as_string[j], (char *)0);

	/* HOME */
	bld_env_variables(&vtable, variables_else[0],
		pjob->ji_grpcache->gc_homedir);

	/* PBS_JOBNAME */
	bld_env_variables(&vtable, variables_else[2], 
			  pjob->ji_wattr[(int)JOB_ATR_jobname].at_val.at_str);

	/* PBS_JOBID */
	bld_env_variables(&vtable, variables_else[3], pjob->ji_qs.ji_jobid);

	/* PBS_QUEUE */
	bld_env_variables(&vtable, variables_else[4],
			 pjob->ji_wattr[(int)JOB_ATR_in_queue].at_val.at_str);

	/* PBS_JOBCOOKIE */
	bld_env_variables(&vtable, variables_else[7],
		pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str);

	/* PBS_NODENUM */
	sprintf(buf, "%d", pjob->ji_nodeid);
	bld_env_variables(&vtable, variables_else[8], buf);

	/* PBS_TASKNUM */
	sprintf(buf, "%ld", ptask->ti_qs.ti_task);
	bld_env_variables(&vtable, variables_else[9], buf);

	/* PBS_MOMPORT */
	sprintf(buf, "%d", pbs_rm_port);
	bld_env_variables(&vtable, variables_else[10], buf);

	if (set_mach_vars(pjob, &vtable) != 0) {
		starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr);
		/* never reaches here */
	}
	umask(077);

	/* set Environment to reflect batch */
	bld_env_variables(&vtable,"PBS_ENVIRONMENT", "PBS_BATCH");
	bld_env_variables(&vtable, "ENVIRONMENT", "BATCH");

	for (i=0; envp[i]; i++)
		bld_env_variables(&vtable, envp[i], NULL);

	/* NULL terminate the envp array, This is MUST DO */
	*(vtable.v_envp + vtable.v_used) = (char *)0;

	j = set_job(pjob, &sjr);
	if (j < 0) {
#ifndef NDEBUG
		if (j == -1) {
		    /* set_job didn't leave message in log_buffer */
		    (void)strcpy(log_buffer,"Unable to set task session");
		}
		log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB,
			pjob->ji_qs.ji_jobid, log_buffer);
#endif  /* NDEBUG */
		starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr);
	}
	ptask->ti_qs.ti_sid = sjr.sj_session;
	if ( (i = mom_set_limits(pjob, SET_LIMIT_SET)) != PBSE_NONE ) {
		(void)sprintf(log_buffer, "Unable to set limits, err=%d", i);
		log_record(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB,
			pjob->ji_qs.ji_jobid, log_buffer);
		if (i == PBSE_RESCUNAV)		/* resource temp unavailable */
			j = JOB_EXEC_RETRY;
		else
			j = JOB_EXEC_FAIL2;
		starter_return(kid_write, kid_read, j, &sjr);
	}

	/* become the user and  execv the shell and become the real job */
	(void)setgroups(pjob->ji_grpcache->gc_ngroup,
			(gid_t *)pjob->ji_grpcache->gc_groups);
	(void)setgid(pjob->ji_qs.ji_un.ji_momt.ji_exgid);
	(void)setuid(pjob->ji_qs.ji_un.ji_momt.ji_exuid);
#ifdef _CRAY
	(void)seteuid(pjob->ji_qs.ji_un.ji_momt.ji_exuid); /* cray kludge */
#endif /* CRAY */

	/* change working directory to User's Home */
	if (chdir(pjob->ji_grpcache->gc_homedir) == -1) {
		log_record(PBSEVENT_JOB | PBSEVENT_SECURITY, PBS_EVENTCLASS_JOB,
			pjob->ji_qs.ji_jobid,
			"Could not chdir to Home directory");
		(void)fprintf(stderr, "Could not chdir to home directory\n");
		starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr);
	}

	/*
	** Set up stdin.
	*/
	if ((fd = open("/dev/null", O_RDONLY)) == -1) {
		log_err(errno, "newtask", "could not open devnull");
		(void)close(0);
	}
	else {
		(void)dup2(fd, 0);
		if (fd > 0)
			(void)close(fd);
	}

	if (pjob->ji_numnodes > 1) {
		/*
		** Open sockets to demux proc for stdout and stderr.
		*/
		if ((fd = open_demux(ipaddr, pjob->ji_stdout)) == -1)
		      starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr);
		(void)dup2(fd, 1);
		if (fd > 1)
			(void)close(fd);
		if ((fd = open_demux(ipaddr, pjob->ji_stderr)) == -1)
		      starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr);
		(void)dup2(fd, 2);
		if (fd > 2)
			(void)close(fd);
	
		(void)write(1,pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str,
		     strlen(pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str));
		(void)write(2,pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str,
		     strlen(pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str));
	} else if ((pjob->ji_wattr[(int)JOB_ATR_interactive].at_flags&ATR_VFLAG_SET) &&

            (pjob->ji_wattr[(int)JOB_ATR_interactive].at_val.at_long > 0)) {
		/* interactive job, single node, write to pty */
		if ((pts = open_pty(pjob)) < 0) {
			log_err(errno, id,"cannot open slave");
			starter_return(kid_write, kid_read,JOB_EXEC_FAIL1,&sjr);
		}
		(void)dup2(pts, 1);
                (void)dup2(pts, 2);

	} else {
		/* normal batch job, single node, write straight to files */
		if (open_std_out_err(pjob) == -1) {
			starter_return(kid_write, kid_read,JOB_EXEC_FAIL1,&sjr);
		}
	}

	log_close(0);
	starter_return(kid_write, kid_read, JOB_EXEC_OK, &sjr);
#if 0	/* def DEBUG */
	for (i=3; i< 40; ++i) {	/* check for any extra open descriptors */
		if (close(i) >= 0)
			fprintf(stderr, "Closed file %d\n", i);
	}
#endif	/* DEBUG */
	environ = vtable.v_envp;
	execvp(argv[0], argv);
	fprintf(stderr, "%s: %s\n", argv[0], strerror(errno));
	exit(254);
	return -1;	/* not reached */
}

/*
**	Release the ji_vnods and ji_hosts arrays of a job.  For every host
**	entry (the list ends at the entry whose hn_node is TM_ERROR_NODE)
**	the host name is freed and every event still queued on hn_events is
**	unlinked and freed together with its argv/envp arrays.  Both job
**	pointers are reset to NULL afterwards.
*/
void
nodes_free(pj)
     job	*pj;
{
	void	arrayfree	A_((	char	**array ));
	hnodent		*host;
	eventent	*ev;

	if (pj->ji_vnods != NULL) {
		(void)free(pj->ji_vnods);
		pj->ji_vnods = NULL;
	}

	if (pj->ji_hosts == NULL)
		return;			/* no host table to release */

	host = pj->ji_hosts;
	while (host->hn_node != TM_ERROR_NODE) {

		if (host->hn_host != NULL)
			free(host->hn_host);

		/* the stream is left open in case another job uses it */

		/* always take the head of the list: each pass unlinks it */
		for (ev = (eventent *)GET_NEXT(host->hn_events);
		     ev != NULL;
		     ev = (eventent *)GET_NEXT(host->hn_events)) {

			if (ev->ee_argv != NULL)
				arrayfree(ev->ee_argv);
			if (ev->ee_envp != NULL)
				arrayfree(ev->ee_envp);
			delete_link(&ev->ee_next);
			free(ev);
		}
		++host;
	}

	free(pj->ji_hosts);
	pj->ji_hosts = NULL;
}

/*
**	Generate the host (ji_hosts) and virtual node (ji_vnods) arrays for
**	a job from its exec_host attribute.
**
**	nodes_free() is called first in case we have seen this job before.
**	exec_host is parsed once to count the '+' separated entries and the
**	arrays of hnodent/vnodent are allocated with one spare element.
**	It is then parsed again: each entry is "hostname" optionally followed
**	by "/index".  A vnodent is filled in per entry; an hnodent is added
**	only the first time a host name is seen.  The element after the last
**	one used in each array has its hn_node / vn_node set to
**	TM_ERROR_NODE.
**
**	If exec_host is not set, or is set without a string, the local
**	host name (mom_host) is used as the single node.  Host names longer
**	than MAXPATHLEN are truncated rather than overrunning the buffer.
**
**	On return ji_numnodes holds the number of distinct hosts and
**	ji_numvnod the number of entries.
*/
void
job_nodes(pjob)
     job	*pjob;
{
	int		i, j, nhosts, nodenum;
	int		ix;
	char		*cp, *nodestr;
	hnodent		*hp;
	vnodent		*np;
	extern	char	mom_host[];

	nodes_free(pjob);
	nodenum = 1;
	nodestr = NULL;
	if (pjob->ji_wattr[(int)JOB_ATR_exec_host].at_flags &
			ATR_VFLAG_SET) {
		nodestr = pjob->ji_wattr[(int)JOB_ATR_exec_host].
				at_val.at_str;
	}

	if (nodestr != NULL) {
		/* one entry plus one for each '+' separator */
		for (cp = nodestr; *cp; cp++) {
			if (*cp == '+')
				nodenum++;
		}
	} else {
		/* attribute unset, or set without a value: just this host */
		nodestr = mom_host;
	}

	pjob->ji_hosts = (hnodent *)calloc(nodenum+1, sizeof(hnodent));
	pjob->ji_vnods = (vnodent *)calloc(nodenum+1, sizeof(vnodent));
	assert(pjob->ji_hosts);
	assert(pjob->ji_vnods);

	nhosts  = 0;
	np = pjob->ji_vnods;
	for (i=0; i<nodenum; i++, np++) {
		char		*dp, nodename[MAXPATHLEN+1];
		char		*dend = nodename + MAXPATHLEN; /* last slot */

		/* split the next "host[/index]" entry off nodestr */
		ix = 0;
		for (cp=nodestr, dp=nodename; *cp; cp++) {
			if (*cp == '/') {
				ix = atoi(cp + 1);
				/* skip the index up to the next entry */
				while ((*cp != '\0') && (*cp != '+'))
					++cp;
				if (*cp == '\0') {
					nodestr = cp;
					break;
				}
			}
			if (*cp == '+') {
				nodestr = cp + 1;
				break;
			}
			if (dp < dend)		/* leave room for the '\0' */
				*dp++ = *cp;
		}
		*dp = '\0';


		/* see if we already have this host */
		for (j=0; j<nhosts; ++j) {
		    if (strcmp(nodename, pjob->ji_hosts[j].hn_host) == 0)
			break;
		}
		hp = &pjob->ji_hosts[j];
		if (j == nhosts) {	/* need to add host to ji_hosts */
			hp->hn_node = nhosts++;
			hp->hn_stream = -1;
			hp->hn_sister = SISTER_OKAY;
			hp->hn_host = strdup(nodename);
			assert(hp->hn_host);
			CLEAR_HEAD(hp->hn_events);
		}
		np->vn_node  = i;	/* make up node id */
		np->vn_host  = &pjob->ji_hosts[j];
		np->vn_index = ix;

		DBPRT(("job_nodes: %d: %s/%d\n",np->vn_node, np->vn_host->hn_host, np->vn_index))
	}
	/* mark the end of both arrays */
	np->vn_node = TM_ERROR_NODE;
	pjob->ji_hosts[nhosts].hn_node = TM_ERROR_NODE;
	pjob->ji_numnodes = nhosts;
	pjob->ji_numvnod  = nodenum;
DBPRT(("job: %s numnodes=%d numvnod=%d\n",pjob->ji_qs.ji_jobid,nhosts,nodenum))
}

/*
 * start_exec() - start execution of a job
 *
 *	Runs on Mother Superior (ji_nodeid is set to 0).  A job cookie is
 *	generated if the job does not have one yet, then the host table is
 *	built with job_nodes().
 *
 *	With more than one host: streams are opened to every sister, two
 *	listening-side sockets are bound for the job's stdout/stderr (for
 *	the demux program) and an IM_JOIN_JOB message carrying the node id,
 *	node count, the two port numbers and the encoded job attributes is
 *	sent to each sister.
 *
 *	With a single host: finish_exec() is called directly.
 *
 *	On failure exec_bail() is called with JOB_EXEC_FAIL1; the encoded
 *	attribute list is released on those paths as well.
 *
 *	NOTE(review): the results of malloc() for the cookie and of
 *	event_alloc() are used unchecked - confirm whether callers/helpers
 *	guarantee success.
 */
void start_exec(pjob)
	job *pjob;
{
	static char	*id = "start_exec";
	eventent	*ep;
	int		i, nodenum, len;
	int		ports[2], socks[2];
	struct	sockaddr_in	saddr;
	hnodent		*np;
	attribute	*pattr;
	list_head	phead;
	svrattrl	*psatl;
	int		stream;
	void	im_compose	A_((	int		stream,
					char		*jobid,
					char		*cookie,
					int		command,
					tm_event_t	event,
					tm_task_id	taskid));

	if ( !(pjob->ji_wattr[(int)JOB_ATR_Cookie].at_flags & ATR_VFLAG_SET)) {
		char			*tt;
		extern	time_t		loopcnt;
		MD5_CTX			c;

		/* 16 digest bytes as 32 hex characters plus the '\0' */
		tt = pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str =
			malloc(33);
		pjob->ji_wattr[(int)JOB_ATR_Cookie].at_flags |=ATR_VFLAG_SET;

		loopcnt++;
		MD5Init(&c);
		MD5Update(&c, (caddr_t)&loopcnt, sizeof(loopcnt));
		MD5Update(&c, (caddr_t)pjob, sizeof(job));
		MD5Final(&c);
		for (i=0; i<16; i++)
			sprintf(&tt[i*2], "%02X", c.digest[i]);
		DBPRT(("===== MD5 %s\n", tt))
	}

	job_nodes(pjob);
	pjob->ji_nodeid = 0;		/* I'm MS */
	nodenum = pjob->ji_numnodes;

	if (nodenum > 1) {
		pjob->ji_resources = (noderes *)calloc(nodenum-1,
						sizeof(noderes));
		assert(pjob->ji_resources != NULL);

		/* encode the job attributes once; sent to every sister */
		CLEAR_HEAD(phead);
		pattr = pjob->ji_wattr;
		for (i=0; i < (int)JOB_ATR_LAST; i++) {
			(void)(job_attr_def+i)->at_encode(pattr+i, &phead,
				(job_attr_def+i)->at_name, (char *)0,
				ATR_ENCODE_MOM);
		}
		attrl_fixlink(&phead);
/*
**		Open streams to the sisterhood.
*/
		for (i=1; i<nodenum; i++) {
			np = &pjob->ji_hosts[i];
	
			np->hn_stream = rpp_open(np->hn_host, pbs_rm_port);
			if (np->hn_stream < 0) {
				sprintf(log_buffer, "rpp_open failed on %s",
						np->hn_host);
				log_err(errno, id, log_buffer);
				free_attrlist(&phead);
				exec_bail(pjob, JOB_EXEC_FAIL1);
				return;
			}
		}
/*
**		Open two sockets for use by demux program later.
*/
		for (i=0; i<2; i++)
			socks[i] = -1;
		for (i=0; i<2; i++) {
			if ((socks[i] = socket(AF_INET, SOCK_STREAM, 0)) == -1)
				break;
	
			/* port left as zero so the system picks one */
			memset(&saddr, '\0', sizeof(saddr));
			saddr.sin_addr.s_addr = INADDR_ANY;
			saddr.sin_family = AF_INET;
			if (bind(socks[i], (struct sockaddr *)&saddr,
					sizeof(saddr)) == -1)
				break;
	
			len = sizeof(saddr);
			if (getsockname(socks[i],
					(struct sockaddr *)&saddr, &len) == -1)
				break;
			ports[i] = (int)ntohs(saddr.sin_port);
		}
		if (i < 2) {	/* loop was left early: something failed */
			log_err(errno, id, "stdout/err socket");
			for (i=0; i<2; i++) {
				if (socks[i] != -1)
					close(socks[i]);
			}
			free_attrlist(&phead);
			exec_bail(pjob, JOB_EXEC_FAIL1);
			return;
		}
		pjob->ji_stdout = socks[0];
		pjob->ji_stderr = socks[1];
/*
**		Send out a JOIN_JOB message to all the MOM's in the sisterhood.
*/
		for (i=1; i<nodenum; i++) {
	
			np = &pjob->ji_hosts[i];
			stream = np->hn_stream;
	
			ep = event_alloc(IM_JOIN_JOB, np, TM_NULL_EVENT,
					 TM_NULL_TASK);
			im_compose(stream, pjob->ji_qs.ji_jobid,
			      pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str,
			      IM_JOIN_JOB, ep->ee_event, TM_NULL_TASK);
			(void)diswsi(stream, i);	/* nodeid of receiver */
			(void)diswsi(stream, nodenum);	/* number of nodes */
			(void)diswsi(stream, ports[0]);	/* out port number */
			(void)diswsi(stream, ports[1]);	/* err port number */
	
			/* write jobattrs */
			psatl = (svrattrl *)GET_NEXT(phead);
			(void)encode_DIS_svrattrl(stream, psatl);
			rpp_flush(stream);
		}
		free_attrlist(&phead);
	} else {		/* no sisters */
		ports[0] = -1;
		ports[1] = -1;
		pjob->ji_stdout = -1;
		pjob->ji_stderr = -1;
		
		finish_exec(pjob);
		LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
			pjob->ji_qs.ji_jobid, log_buffer);
	}
	return;
}

/*
 * fork_me - fork mom, close all other connection and set default signal actions
 */

pid_t fork_me(conn)
	int conn;
{
	struct sigaction act;
	pid_t		 pid;

	fflush(stdout);
	fflush(stderr);
	pid = fork();
	if (pid == 0) {
		/* now the child */

		/* Turn off alarm if it should happen to be on */
		alarm(0);
		rpp_terminate();

		/* Reset signal actions for most to SIG_DFL */
		sigemptyset(&act.sa_mask);
		act.sa_flags   = 0;
		act.sa_handler = SIG_DFL;
		(void)sigaction( SIGCHLD, &act, (struct sigaction *)0);
#ifdef _CRAY
		(void)sigaction(WJSIGNAL, &act, (struct sigaction *)0);
#endif	/* _CRAY */
		(void)sigaction( SIGHUP, &act, (struct sigaction *)0);
		(void)sigaction( SIGINT, &act, (struct sigaction *)0);
		(void)sigaction( SIGTERM, &act, (struct sigaction *)0);

		/* Reset signal mask */
		(void)sigprocmask(SIG_SETMASK, &act.sa_mask, NULL);

		(void)mom_close_poll();
		net_close(conn);	/* close all but for the current */
	} else if (pid < 0)
		log_err(errno, "fork_me", "fork failed");

	return (pid);
}

/*
 * starter_return - return starter value, 
 *	exit if negative
 */

static void starter_return(upfds, downfds, code, sjrtn)
	int upfds;
	int downfds;
	int code;
	struct startjob_rtn *sjrtn;
{
	struct startjob_rtn ack;
	int i;

	sjrtn->sj_code = code;
	(void)write(upfds, (char *)sjrtn, sizeof(*sjrtn));
	(void)close(upfds);

	/* wait for acknowledgement */
	do {
		i = read(downfds, &ack, sizeof(ack));
		if ((i == -1) && (errno != EINTR))
			break;
	} while (i < 0);
	(void)close(downfds);
	if (code < 0) {
		exit(254);
	}
}
	
/*
 * std_file_name - generate the fully qualified path/name for a
 *		   job standard stream
 *
 *	pjob	 the job
 *	which	 StdOut, StdErr or Chkpt
 *	keeping	 RETURN: set to 1 when the file is created directly in the
 *		 user's home directory, 0 otherwise
 *
 *	For an interactive job the outpath attribute (name of the pty) is
 *	returned.  Otherwise the name is built in a static buffer, which
 *	is overwritten by the next call:
 *	  - if the job's keep attribute contains the stream's letter:
 *	    <home>/<jobname>.<letter><leading digits of the job id>
 *	  - else <spool dir><file prefix><suffix>, or under the home
 *	    directory when NO_SPOOL_OUTPUT is defined.
 *
 *	Every piece is appended with a bound, so an over-long component is
 *	truncated instead of overrunning the buffer.
 */

char *std_file_name(pjob, which, keeping)
	job		*pjob;
	enum job_file	 which;
	int		*keeping;	/* RETURN */
{
	static char  path[MAXPATHLEN+1];
	char  key;
	int   len;
	char *pd;
	char *suffix;

	if ((pjob->ji_wattr[(int)JOB_ATR_interactive].at_flags&ATR_VFLAG_SET) &&
	    (pjob->ji_wattr[(int)JOB_ATR_interactive].at_val.at_long > 0)) {

		/* interactive job, name of pty is in outpath */

		*keeping = 0;
		return (pjob->ji_wattr[(int)JOB_ATR_outpath].at_val.at_str);
	}

	switch (which) {
	    case StdOut:
		key    = 'o';
		suffix = JOB_STDOUT_SUFFIX;
		break;

	    case StdErr:
		key    = 'e';
		suffix = JOB_STDERR_SUFFIX;
		break;

	    case Chkpt:
		key = '\001';	/* should never be found */
		suffix = JOB_CKPT_SUFFIX;
		break;

	    default:
		/* unknown stream: never "kept", no suffix; avoids using */
		/* key/suffix uninitialized				  */
		key = '\001';
		suffix = "";
		break;
	}

	/* Is file to be kept?, if so use default name in Home directory */

	if ((pjob->ji_wattr[(int)JOB_ATR_keep].at_flags & ATR_VFLAG_SET) &&
	    (strchr(pjob->ji_wattr[(int)JOB_ATR_keep].at_val.at_str, key))) {

		/* yes, it is to be kept */

		path[0] = '\0';
		(void)strncat(path, pjob->ji_grpcache->gc_homedir,
			      sizeof(path) - 1);
		
		/* use the part of the job name from its last '/' on; */
		/* if there is none, supply the separating '/'        */
		pd = strrchr(pjob->ji_wattr[(int)JOB_ATR_jobname].at_val.at_str, '/');
		if (pd == (char *)0) {
			pd = pjob->ji_wattr[(int)JOB_ATR_jobname].at_val.at_str;
			(void)strncat(path, "/",
				      sizeof(path) - strlen(path) - 1);
		}

		/* start with the job name */
		(void)strncat(path, pd, sizeof(path) - strlen(path) - 1);
		len = (int)strlen(path);
		if (len < (int)sizeof(path) - 1)
			*(path + len++) = '.';          /* the dot        */
		if (len < (int)sizeof(path) - 1)
			*(path + len++) = key;		/* the letter     */
		pd = pjob->ji_qs.ji_jobid;      /* the seq_number */
		while (isdigit((unsigned char)*pd) &&
		       (len < (int)sizeof(path) - 1))
			*(path + len++) = *pd++;
		*(path + len) = '\0';
		*keeping = 1;
	} else {

		/* put into spool directory unless NO_SPOOL_OUTPUT is defined */

		path[0] = '\0';
#ifdef NO_SPOOL_OUTPUT		/* force all output to user's HOME */
		(void)strncat(path, pjob->ji_grpcache->gc_homedir,
			      sizeof(path) - 1);
		(void)strncat(path, "/", sizeof(path) - strlen(path) - 1);
		*keeping = 1;
#else	/* NO_SPOOL_OUTPUT */
		(void)strncat(path, path_spool, sizeof(path) - 1);
		*keeping = 0;
#endif	/* NO_SPOOL_OUTPUT */
		(void)strncat(path, pjob->ji_qs.ji_fileprefix,
			      sizeof(path) - strlen(path) - 1);
		(void)strncat(path, suffix, sizeof(path) - strlen(path) - 1);
	}
	return (path);
}

/*
 * open_std_file - open either standard output or standard error for the job.
 *
 *	pjob	the job
 *	which	which file (passed on to std_file_name())
 *	mode	flags for open()
 *	exgid	group id for the file
 *
 *	When the file lives in the user's home directory ("keeping") it is
 *	opened with the effective gid/uid switched to exgid and the job's
 *	execution uid, then the effective ids are put back to 0/pbsgroup.
 *	Otherwise it is opened as is and its ownership is changed to the
 *	execution uid and exgid.
 *
 *	Returns the descriptor, or -1 on failure.  errno is left as set by
 *	the failing call, and the effective gid is restored even when the
 *	uid switch fails.
 */

int open_std_file(pjob, which, mode, exgid)
	job		*pjob;
	enum job_file	 which;		/* which file */
	int		 mode;		/* file mode */
	gid_t		 exgid;		/* gid for file */
{
	int   fds;
	int   keeping;
	int   saverr;
	char *path;

	path = std_file_name(pjob, which, &keeping);
	if (keeping) {
		/* in user's home,  may be NFS mounted, must create as user */

#if defined(HAVE_SETEUID) && defined(HAVE_SETEGID)
		/* most systems */
		if (setegid(exgid) == -1)
			return -1;
		if (seteuid(pjob->ji_qs.ji_un.ji_momt.ji_exuid) == -1) {
			/* undo the group switch before giving up */
			saverr = errno;
			(void)setegid(pbsgroup);
			errno = saverr;
			return -1;
		}
		fds = open(path, mode, 0666);
		saverr = errno;		/* keep open()'s error for caller */
		(void)seteuid(0);
		(void)setegid(pbsgroup);
		errno = saverr;
#elif defined(HAVE_SETRESUID) && defined(HAVE_SETRESGID)
		/* HPUX and the like */
		if (setresgid(-1, exgid, -1) == -1)
			return -1;
		if (setresuid(-1,pjob->ji_qs.ji_un.ji_momt.ji_exuid,-1) == -1) {
			/* undo the group switch before giving up */
			saverr = errno;
			(void)setresgid(-1, pbsgroup, -1);
			errno = saverr;
			return -1;
		}
		fds = open(path, mode, 0666);
		saverr = errno;		/* keep open()'s error for caller */
		(void)setresuid(-1, 0, -1);
		(void)setresgid(-1, pbsgroup, -1);
		errno = saverr;
#else	/* Neither */
Crash and Burn - need seteuid/setegid   or need setresuid/setresgid
#endif	/* HAVE_SETRESUID */
	} else {
		fds = open(path, mode, 0666);
		if (fds >= 0) {
			/* change file uid/gid to execution user of job  */
			(void)fchown(fds, pjob->ji_qs.ji_un.ji_momt.ji_exuid, exgid);
		}
	}
	return (fds);
}

/*
 * find_env_slot - find if the environment variable is already in the table,
 *	so that the caller can replace the existing one with the new one.
 *
 *	ptbl	table of environment strings built so far
 *	pstr	candidate string, normally of the form "name=value"
 *
 *	Returns the index in ptbl->v_envp of the entry with the same name,
 *	or -1 if there is none.
 *
 *	The name is everything up to and including the '='.  If pstr has
 *	no '=', the scan stops at the end of the string and an entry
 *	matches only if it is exactly that name or that name followed
 *	by '='.
 */

static int find_env_slot(ptbl, pstr)
	struct var_table *ptbl;
	char             *pstr;
{
	int	 i;
	int	 len;
	int	 has_eq;

	/* length of the name part; never run past the terminator */
	for (len = 0; (pstr[len] != '\0') && (pstr[len] != '='); ++len)
		;
	has_eq = (pstr[len] == '=');
	if (has_eq)
		++len;		/* one extra for '=' */

	for (i=0; i<ptbl->v_used; ++i) {
		if (strncmp(ptbl->v_envp[i], pstr, len) != 0)
			continue;
		if (has_eq)
			return (i);
		/* bare name: make sure we matched the whole name */
		if ((ptbl->v_envp[i][len] == '\0') ||
		    (ptbl->v_envp[i][len] == '='))
			return (i);
	}
	return (-1);
}

/*
 * bld_env_variables - build up the array of environment variables which are
 *	passed to the job.
 *
 *	vtable	table being built: v_block/v_bsize describe the remaining
 *		string space, v_envp/v_used/v_ensize the pointer array
 *	name	variable name, or the complete "name=value" string
 *	value	the value; may be null if total string (name=value) is
 *		included in "name"
 *
 *	The string is copied into the string space.  If a variable with the
 *	same name is already in the table its pointer is replaced, otherwise
 *	a new pointer is appended.  The request is silently dropped when
 *	name is null, when the string has no '=', when the string space is
 *	too small, or when appending would leave no free pointer: the last
 *	pointer is kept free for the NULL terminator that the callers store
 *	at v_envp[v_used].
 */

void bld_env_variables(vtable, name, value)
	struct var_table *vtable;
	char		 *name;
	char		 *value;
{
	int	amt;
	int	i;

	if (name == (char *)0)
		return;
	if ((value == (char *)0) && (strchr(name, '=') == (char *)0))
		return;			/* not of the form name=value */

	amt = strlen(name) + 1;		/* plus 1 for the '\0' */
	if (value)
		amt += strlen(value) + 1;	/* plus 1 for "="     */
	if (amt > vtable->v_bsize)	 	/* no room for string */
		return;

	(void)strcpy(vtable->v_block, name);
	if (value) {
		(void)strcat(vtable->v_block, "=");
		(void)strcat(vtable->v_block, value);
	}

	i = find_env_slot(vtable, vtable->v_block);
	if (i < 0) {
		/* new variable: need this pointer and one for terminator */
		if (vtable->v_used + 1 >= vtable->v_ensize)
			return;		/* no room for pointer */
		i = vtable->v_used++;
	}
	*(vtable->v_envp + i) = vtable->v_block;

	/* the string space just filled is now in use */
	vtable->v_block += amt;
	vtable->v_bsize -= amt;
}
	

/*
 * init_groups - read the group database and build an array of
 *	group memberships.
 *
 *	pwname	   user's name
 *	pwgrp	   user's group from the password entry
 *	groupsize  number of elements available in "groups"
 *	groups	   array to fill in
 *
 *	The list starts with pwgrp unless it is 0; entries of the group
 *	database whose gid equals pwgrp are skipped, so gid 0 is not
 *	listed when it is the user's primary group.
 *	NOTE(review): that looks deliberate - confirm.
 *	Each other group that names the user as a member is added once.
 *
 *	Returns the number of groups stored, or -1 if "groups" is too
 *	small to hold them all.
 */

int init_groups(pwname, pwgrp, groupsize, groups)
	char *pwname;		/* User's name */
	int   pwgrp;		/* User's group from pw entry */
	int   groupsize;	/* size of the array, following argument */
	int  *groups;		/* ptr to group array, list build there */
{
	struct group *grp;
	int i;
	int n;

	n = 0;
	if (pwgrp != 0) {
		if (n >= groupsize)
			return (-1);	/* no room even for primary group */
		*(groups + n++) = pwgrp;
	}

	setgrent();

	while ((grp = getgrent()) != (struct group *)0) {
		if (grp->gr_gid == pwgrp)
			continue;	/* primary group handled above */
		
		for (i = 0; grp->gr_mem[i]; i++) {
			if (strcmp(grp->gr_mem[i], pwname) != 0)
				continue;
			if (n >= groupsize) {
				endgrent();
				return (-1);
			}
			*(groups + n++) = grp->gr_gid;
			break;		/* one entry per group is enough */
		}
	}
	endgrent();
	return (n);
}

/*
 * catchinter - catch death of writer child and/or shell child of interactive
 *	When one dies, kill off the other; there is no mercy in this family.
 *
 *	Installed as the SIGCHLD handler of the interactive "reader"
 *	process.  One child is reaped without blocking; if none has
 *	changed state the handler just returns.  Otherwise the surviving
 *	child (the shell if the writer died, else the writer) is sent
 *	SIGKILL and waited for, and mom_reader_go is cleared.
 *
 *	errno is saved on entry and restored on exit so that the code
 *	interrupted by the signal does not see a value changed by the
 *	calls made here.
 */	

static void catchinter(sig)
	int sig;
{
	int   saved_errno = errno;
	int   status;
	pid_t pid;

	pid = waitpid(-1, &status, WNOHANG);
	if (pid == 0) {
		errno = saved_errno;
		return;
	}
	if (pid == writerpid) {
		kill(shellpid, SIGKILL);
		(void)wait(&status);
	} else {
		kill(writerpid, SIGKILL);
		(void)wait(&status);
	}
	mom_reader_go = 0;
	errno = saved_errno;
}
