/*
*         Portable Batch System (PBS) Software License
* 
* Copyright (c) 1999, MRJ Technology Solutions.
* All rights reserved.
* 
* Acknowledgment: The Portable Batch System Software was originally developed
* as a joint project between the Numerical Aerospace Simulation (NAS) Systems
* Division of NASA Ames Research Center and the National Energy Research
* Supercomputer Center (NERSC) of Lawrence Livermore National Laboratory.
* 
* Redistribution of the Portable Batch System Software and use in source
* and binary forms, with or without modification, are permitted provided
* that the following conditions are met:
* 
* - Redistributions of source code must retain the above copyright and
*   acknowledgment notices, this list of conditions and the following
*   disclaimer.
* 
* - Redistributions in binary form must reproduce the above copyright and 
*   acknowledgment notices, this list of conditions and the following
*   disclaimer in the documentation and/or other materials provided with the
*   distribution.
* 
* - All advertising materials mentioning features or use of this software must
*   display the following acknowledgment:
* 
*   This product includes software developed by NASA Ames Research Center,
*   Lawrence Livermore National Laboratory, and MRJ Technology Solutions.
* 
*         DISCLAIMER OF WARRANTY
* 
* THIS SOFTWARE IS PROVIDED BY MRJ TECHNOLOGY SOLUTIONS ("MRJ") "AS IS" WITHOUT 
* WARRANTY OF ANY KIND, AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT ARE EXPRESSLY DISCLAIMED.
* 
* IN NO EVENT, UNLESS REQUIRED BY APPLICABLE LAW, SHALL MRJ, NASA, NOR
* THE U.S. GOVERNMENT BE LIABLE FOR ANY DIRECT DAMAGES WHATSOEVER,
* NOR ANY INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* 
* This license will be governed by the laws of the Commonwealth of Virginia,
* without reference to its choice of law rules.
*/

#include <pbs_config.h>   /* the master config generated by configure */

#include	<assert.h>
#include	<stdio.h>
#include	<stdlib.h>
#include	<unistd.h>
#include	<dirent.h>
#include	<pwd.h>
#include	<signal.h>
#include	<string.h>
#include	<ctype.h>
#include	<errno.h>
#include	<fcntl.h>
#include	<time.h>
#include	<limits.h>
#include	<netdb.h>
#include	<sys/types.h>
#include	<sys/param.h>
#include	<sys/times.h>
#include	<sys/stat.h>
#include	<netinet/in.h>
#include	<sys/time.h>
#include	<sys/resource.h>

#include 	"libpbs.h"
#include 	"pbs_ifl.h"
#include	"list_link.h"
#include	"attribute.h"
#include	"resource.h"
#include	"server_limits.h"
#include	"job.h"
#include	"pbs_error.h"
#include	"log.h"
#include	"net_connect.h"
#include	"rpp.h"
#include	"dis.h"
#include	"dis_init.h"

/* RCS identification string embedded in the object file. */
static char ident[] = "@(#) $RCSfile: mom_comm.c,v $ $Revision: 2.6.2.1 $";


/* Global Data Items */

/* All of the following are defined elsewhere; only the uses in this file are described. */
extern	int		exiting_tasks;	/* set to 1 here whenever a job is moved to JOB_SUBSTATE_EXITING */
extern	char		mom_host[];
extern	char		*path_jobs;	/* prefix used here to build job and task file paths */
extern	char		*path_home;
extern	int		pbs_errno;
extern	unsigned int	pbs_mom_port;
extern	unsigned int	pbs_rm_port;	/* port passed to rpp_open() when contacting another node */
extern	unsigned int	pbs_tm_port;
extern	list_head	mom_polljobs;	/* must have resource limits polled */
extern	list_head	svr_alljobs;	/* all jobs under MOM's control */
extern	int		termin_child;
extern	time_t		time_now;	/* used here as job start time and mtime */
extern	void		*okclients;	/* accept connections from */
extern	int		server_stream;	/* reset to -1 by im_eof() when that stream is closed */

/* printf format that turns a task id into the "/<id>" file name used by task_save() */
char	task_fmt[] = "/%010.10ld";
/* NOTE(review): not referenced in the reviewed part of this file; presumably a placeholder global id */
char	noglobid[] = "none";

/*
 * the following funny business is due to the fact that O_SYNC
 * is not currently POSIX
 *
 * O_Sync becomes O_SYNC when that is defined, else _FSYNC when that
 * macro evaluates non-zero, else 0 (no synchronous-write flag at all).
 */
#ifdef O_SYNC
#define O_Sync O_SYNC
#elif _FSYNC
#define O_Sync _FSYNC
#else
#define O_Sync 0
#endif

/*
**	Save the critical information associated with a task to disk.
**
**	The fixed-size ti_qs sub-structure of the task is written to the
**	file whose name is built from path_jobs, the job's file prefix,
**	JOB_TASKDIR_SUFFIX and the task id formatted with task_fmt.
**
**	Returns 0 on success and -1 on failure (path too long, open,
**	lseek or write problem).  Failures are reported with log_err().
*/
int
task_save(ptask)
    task	*ptask;
{
	static	char	id[] = "task_save";
	job	*pjob = ptask->ti_job;
	int	fds;
	int	i;
	char	namebuf[MAXPATHLEN];
	char	filnam[MAXPATHLEN];
	int	openflags;

	(void)sprintf(filnam, task_fmt, ptask->ti_qs.ti_task);

	/* refuse to assemble a path that would overrun namebuf */
	if (strlen(path_jobs) + strlen(pjob->ji_qs.ji_fileprefix) +
			strlen(JOB_TASKDIR_SUFFIX) + strlen(filnam) >=
			sizeof(namebuf)) {
		log_err(-1, id, "task file path too long");
		return (-1);
	}

	(void)strcpy(namebuf, path_jobs);      /* job directory path */
	(void)strcat(namebuf, pjob->ji_qs.ji_fileprefix);
	(void)strcat(namebuf, JOB_TASKDIR_SUFFIX);
	(void)strcat(namebuf, filnam);

	openflags =  O_WRONLY | O_CREAT | O_Sync;
	fds = open(namebuf, openflags, 0600);
	if (fds < 0) {
		log_err(errno, id, "error on open");
		return (-1);
	}

	/* just write the "critical" base structure to the file */

	while ((i = write(fds, (char *)&ptask->ti_qs, sizeof(ptask->ti_qs))) !=
			sizeof(ptask->ti_qs)) {
		if ((i < 0) && (errno == EINTR)) {	/* retry the write */
			/* start over from the beginning of the file */
			if (lseek(fds, (off_t)0, SEEK_SET) < 0) {
				log_err(errno, id, "lseek");
				(void)close(fds);
				return (-1);
			}
			continue;
		}
		else {
			/*
			** errno is only meaningful when write() itself
			** failed; a short write leaves it stale.
			*/
			log_err((i < 0) ? errno : -1, id, "quickwrite");
			(void)close(fds);
			return (-1);
		}
	}
	(void)close(fds);
	return (0);
}

/*
**	Allocate an event and link it to the given nodeent entry.
*/
eventent *
event_alloc(command, pnode, event, taskid)
	int		command;
	hnodent		*pnode;
	tm_event_t	event;
	tm_task_id	taskid;
{
	static	tm_event_t	eventnum = TM_NULL_EVENT+1;
	eventent		*ep;

	ep = (eventent *)malloc(sizeof(eventent));
	assert(ep);
	ep->ee_command = command;
	ep->ee_event = (event == TM_NULL_EVENT) ? eventnum++ : event;
	ep->ee_taskid = taskid;
	ep->ee_forward.fe_node = TM_ERROR_NODE;
	ep->ee_forward.fe_event = TM_ERROR_EVENT;
	ep->ee_forward.fe_taskid = TM_NULL_TASK;
	ep->ee_argv = NULL;
	ep->ee_envp = NULL;
	CLEAR_LINK(ep->ee_next);

	append_link(&pnode->hn_events, &ep->ee_next, ep);
	return ep;
}

/*
**	Create a new task if the current number is less then
**	the tasks per node limit.
*/
task	*
task_create(pjob, taskid)
     job	*pjob;
     tm_task_id	taskid;
{
	task		*ptask;
	attribute	*at;
	resource_def	*rd;
	resource	*pres;
	u_long		tasks;

	for (ptask = (task *)GET_NEXT(pjob->ji_tasks), tasks=0;
		ptask != NULL;
		ptask = (task *)GET_NEXT(ptask->ti_jobtask), tasks++);

	at = &pjob->ji_wattr[(int)JOB_ATR_resource];
	rd = find_resc_def(svr_resc_def, "taskspn", svr_resc_size);
	assert(rd != NULL);
	pres = find_resc_entry(at, rd);
	if (pres != NULL) {
		if (tasks >= (unsigned long)pres->rs_value.at_val.at_long)
			return NULL;
	}

	ptask = (task *)malloc(sizeof(task));
	assert(ptask);
	ptask->ti_job = pjob;
	CLEAR_LINK(ptask->ti_jobtask);
	append_link(&pjob->ji_tasks, &ptask->ti_jobtask, ptask);
	ptask->ti_fd = -1;
	ptask->ti_flags = 0;
	ptask->ti_register = TM_NULL_EVENT;
	CLEAR_HEAD(ptask->ti_obits);
	CLEAR_HEAD(ptask->ti_info);

	memset(ptask->ti_qs.ti_parentjobid, 0,
		sizeof(ptask->ti_qs.ti_parentjobid));
	ptask->ti_qs.ti_parentnode = TM_ERROR_NODE;
	ptask->ti_qs.ti_parenttask = 0;
	ptask->ti_qs.ti_task = ((taskid == TM_NULL_TASK) ?
			pjob->ji_taskid++ : taskid);
	ptask->ti_qs.ti_status =  TI_STATE_EMBRYO;
	ptask->ti_qs.ti_sid = 0;
	ptask->ti_qs.ti_exitstat = 0;
	memset(ptask->ti_qs.ti_u.ti_hold, 0, sizeof(ptask->ti_qs.ti_u.ti_hold));

	return ptask;
}

task	*
task_find(pjob, taskid)
     job	*pjob;
     tm_task_id	taskid;
{
	task			*ptask;

	for (ptask = (task *)GET_NEXT(pjob->ji_tasks);
			ptask;
			ptask = (task *)GET_NEXT(ptask->ti_jobtask)) {
		if (ptask->ti_qs.ti_task == taskid)
			break;
	}
	return ptask;
}


/*
**	Find a task of a job that can be replied to, i.e. one that
**	exists and has a usable descriptor (ti_fd >= 0).
**	Any failure is logged and NULL is returned.
*/
task	*
task_check(pjob, taskid)
     job	*pjob;
     tm_task_id	taskid;
{
	static	char	id[] = "task_check";
	task		*found;

	found = task_find(pjob, taskid);
	if (found != NULL && found->ti_fd >= 0)
		return found;

	/* pick the message matching the reason for the failure */
	if (found == NULL)
		sprintf(log_buffer, "%s requesting task %ld not found",
			pjob->ji_qs.ji_jobid, taskid);
	else
		sprintf(log_buffer, "cannot tm_reply to %s task %ld",
			pjob->ji_qs.ji_jobid, taskid);

	log_err(-1, id, log_buffer);
	return NULL;
}

/*
** task_recov()
** Recover (read in) the tasks from their save files for a job.
**
**	This function is only needed upon MOM start up.
*/

int
task_recov(pjob)
	job	*pjob;
{
	static	char	id[] = "task_recov";
	int		fds;
	task		*pt;
	char		dirname[MAXPATHLEN];
	char		namebuf[MAXPATHLEN];
	DIR		*dir;
	struct	dirent	*pdirent;
	struct	taskfix	task_save;

	(void)strcpy(dirname, path_jobs);      /* job directory path */
	(void)strcat(dirname, pjob->ji_qs.ji_fileprefix);
	(void)strcat(dirname, JOB_TASKDIR_SUFFIX);

	if ((dir = opendir(dirname)) == NULL)
		return -1;

	(void)strcat(dirname, "/");
	while ((pdirent = readdir(dir)) != (struct dirent *)0) {
		if (pdirent->d_name[0] == '.')
			continue;

		(void)strcpy(namebuf, dirname);
		(void)strcat(namebuf, pdirent->d_name);

		fds = open(namebuf, O_RDONLY, 0);
		if (fds < 0) {
			log_err(errno, id, "open of task file");
			unlink(namebuf);
			continue;
		}

		/* read in task quick save sub-structure */
		if (read(fds, (char *)&task_save, sizeof(task_save)) !=
				sizeof(task_save)) {
			log_err(errno, id, "read");
			unlink(namebuf);
			(void)close(fds);
			continue;
		}
		if ((pt = task_create(pjob, TM_NULL_TASK)) == NULL) {
			unlink(namebuf);
			(void)close(fds);
			continue;
		}
		pt->ti_qs = task_save;
		(void)close(fds);
	}
	(void)closedir(dir);
	return 0;
}

/*
**	Send a reply message to a user proc over a TCP stream.
*/
int
tm_reply(stream, com, event)
     int	stream;
     int	com;
     tm_event_t	event;
{
	int     ret;

	DIS_tcp_funcs();

	ret = diswsi(stream, TM_PROTOCOL);
	if (ret != DIS_SUCCESS)
		goto done;
	ret = diswsi(stream, TM_PROTOCOL_VER);
	if (ret != DIS_SUCCESS)
		goto done;
	ret = diswsi(stream, com);
	if (ret != DIS_SUCCESS)
		goto done;
	ret = diswsi(stream, event);
	if (ret != DIS_SUCCESS)
		goto done;
	return DIS_SUCCESS;

 done:
	DBPRT(("tm_reply: send error %s\n", dis_emsg[ret]))
	return ret;
}

/*
**	Start a standard inter-MOM message.
*/
int
im_compose(stream, jobid, cookie, command, event, taskid)
     int	stream;
     char	*jobid;
     char	*cookie;
     int	command;
     tm_event_t	event;
     tm_task_id	taskid;
{
	int	ret;

	if (stream < 0)
		return DIS_EOF;
	DIS_rpp_reset();

	ret = diswsi(stream, IM_PROTOCOL);
	if (ret != DIS_SUCCESS)
		goto done;
	ret = diswsi(stream, IM_PROTOCOL_VER);
	if (ret != DIS_SUCCESS)
		goto done;
	ret = diswst(stream, jobid);
	if (ret != DIS_SUCCESS)
		goto done;
	ret = diswst(stream, cookie);
	if (ret != DIS_SUCCESS)
		goto done;
	ret = diswsi(stream, command);
	if (ret != DIS_SUCCESS)
		goto done;
	ret = diswsi(stream, event);
	if (ret != DIS_SUCCESS)
		goto done;
	ret = diswsi(stream, taskid);
	if (ret != DIS_SUCCESS)
		goto done;
	return DIS_SUCCESS;

 done:
	DBPRT(("im_compose: send error %s\n", dis_emsg[ret]))
	return ret;
}

/*
**	Send a message (command = com) to all the other MOMs in
**	the job -> pjob.
**
**	Nodes that are this node, or that are not marked SISTER_OKAY,
**	are skipped.  A stream is opened to a node that has none, and
**	an event is recorded on the node for every attempt.  A node
**	whose message could not be composed or flushed is left marked
**	SISTER_EOF.
**
**	Returns the number of nodes the message was flushed to, or 0
**	when the job has no cookie set.
*/
int
send_sisters(pjob, com)
    job		*pjob;
    int		com;
{
	int		idx, sent;
	hnodent		*np;
	eventent	*ep;
	char		*cookie;

	DBPRT(("send_sisters: command %d\n", com))
	if ( !(pjob->ji_wattr[(int)JOB_ATR_Cookie].at_flags & ATR_VFLAG_SET))
		return 0;
	cookie = pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str;

	sent = 0;
	for (idx = 0; idx < pjob->ji_numnodes; idx++) {
		np = &pjob->ji_hosts[idx];

		/* skip myself and any sister that is already gone */
		if (np->hn_node == pjob->ji_nodeid ||
		    np->hn_sister != SISTER_OKAY)
			continue;

		if (np->hn_stream == -1)
			np->hn_stream = rpp_open(np->hn_host, pbs_rm_port);
		ep = event_alloc(com, np, TM_NULL_EVENT, TM_NULL_TASK);

		/* assume the worst until the message is really out */
		np->hn_sister = SISTER_EOF;
		if (im_compose(np->hn_stream, pjob->ji_qs.ji_jobid,
				cookie, com, ep->ee_event,
				TM_NULL_TASK) == DIS_SUCCESS &&
		    rpp_flush(np->hn_stream) != -1) {
			np->hn_sister = SISTER_OKAY;
			sent++;
		}
	}
	return sent;
}

/*
** Send an IM_ERROR message carrying the code "err" back on the current
** stream, but only when the local flag "reply" is non-zero.
**
** The macro relies on variables named reply, stream, jobid, cookie,
** event and fromtask being in scope where it is expanded.
** It expands to a bare "if (...) { ... }" and is written at its call
** sites without a trailing semicolon, so it must not be used as the
** unbraced body of an if that is followed by an else.
*/
#define	SEND_ERR(err) \
if (reply) { \
	(void)im_compose(stream, jobid, cookie, IM_ERROR, event, fromtask); \
	(void)diswsi(stream, err); \
}

/*
** Work out which host entry of a job the virtual node "nodeid"
** belongs to and check it against the stream a message came in on.
**
** Returns NULL when the node id is not part of the job, or when the
** stream recorded for the host and the stream passed in come from
** different IP addresses.  Otherwise the host entry is returned.
**
** Side effects: with stream == -1 a stream is opened to a remote
** host that has none; with a real stream and none recorded, the
** stream passed in is recorded in the host entry.
*/
hnodent	*
find_node(pjob, stream, nodeid)
    job		*pjob;
    int		stream;
    tm_node_id	nodeid;
{
	static	char		id[] = "find_node";
	int			idx;
	vnodent			*vp;
	hnodent			*hp;
	struct  sockaddr_in     *stream_addr;
	struct  sockaddr_in     *node_addr;

	/* locate the virtual node */
	idx = 0;
	vp = pjob->ji_vnods;
	while (idx < pjob->ji_numvnod && vp->vn_node != nodeid) {
		vp++;
		idx++;
	}
	if (idx == pjob->ji_numvnod) {
		sprintf(log_buffer, "node %d not found", nodeid);
		log_err(-1, id, log_buffer);
		return NULL;
	}
	hp = vp->vn_host;

	if (stream == -1) {
		/*
		** Caller has no stream: if the node is not me and
		** nothing is open to it yet, open one.
		*/
		if (pjob->ji_nodeid != hp->hn_node && hp->hn_stream == -1)
			hp->hn_stream = rpp_open(hp->hn_host, pbs_rm_port);
		return hp;
	}

	if (hp->hn_stream == -1) {
		/* nothing recorded for the host so far, remember this one */
		hp->hn_stream = stream;
		return hp;
	}

	if (hp->hn_stream == stream)
		return hp;

	/*
	** The host entry holds a different stream number than the
	** one passed in (supposedly from the same node).  That is
	** acceptable as long as both streams have the same IP address
	** (only the port may differ).  Anything else is a mixup.
	*/
	stream_addr = rpp_getaddr(stream);
	node_addr = rpp_getaddr(hp->hn_stream);

	if (memcmp(&stream_addr->sin_addr, &node_addr->sin_addr,
			sizeof(node_addr->sin_addr)) == 0)
		return hp;

	sprintf(log_buffer,
		"stream id %d does not match %d to node %d",
		stream, hp->hn_stream, nodeid);
	log_err(-1, id, log_buffer);

	sprintf(log_buffer, "%s: stream addr %s\n", id,
		netaddr(stream_addr));
	log_err(-1, id, log_buffer);

	sprintf(log_buffer, "%s: node addr %s\n", id,
		netaddr(node_addr));
	log_err(-1, id, log_buffer);
	return NULL;
}

/*
** An error has been encountered starting a job.
** Log it, ask every sister to abort her copy of the job
** (IM_ABORT_JOB), and mark the local job as exiting with the
** exit status JOB_EXEC_RETRY.  A shortfall in the number of abort
** requests that went out is logged but otherwise ignored.
*/
void
job_start_error(pjob, code, nodename)
    job		*pjob;
    int		code;
    char  	*nodename;
{
	static  char    id[] = "job_start_error";
	int	sent;
	int	expected;

	(void)sprintf(log_buffer, "job_start_error from node %s", nodename);
	log_err(code, log_buffer, pjob->ji_qs.ji_jobid);

	/* every node but this one should get an abort request */
	sent = send_sisters(pjob, IM_ABORT_JOB);
	expected = pjob->ji_numnodes-1;
	if (sent != expected) {
		sprintf(log_buffer,
			"%s: sent %d ABORT requests, should be %d",
			id, sent, expected);
		log_err(-1, id, log_buffer);
	}

	pjob->ji_qs.ji_un.ji_momt.ji_exitstat = JOB_EXEC_RETRY;
	pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;
	exiting_tasks = 1;
}

/*
** Free a malloc'ed, NULL terminated array of malloc'ed strings
** (used in SPAWN).  Every element up to the terminating NULL is
** freed, then the array itself.
**
** A NULL array pointer is accepted and ignored; event records start
** out with NULL argv/envp arrays, so callers may legitimately pass one.
*/
void
arrayfree(array)
    char	**array;
{
	char	**entry;

	if (array == NULL)
		return;

	for (entry = array; *entry != NULL; entry++)
		free(*entry);
	free(array);
}

/*
**	Deal with events hooked to a node where a stream has gone
**	south or we are going away.
*/
void
node_bailout(pjob, np)
	job			*pjob;
	hnodent			*np;
{
	static	char	id[] = "node_bailout";
	task		*ptask;
	eventent	*ep;
	int		i;

	ep = (eventent *)GET_NEXT(np->hn_events);
	while (ep) {
		switch (ep->ee_command) {

		case	IM_JOIN_JOB:
			/*
			** I'm MS and a node has failed to respond to the
			** call.  Maybe in the future the use can specify
			** the job can start with a range of nodes so
			** one (or more) missing can be tolerated.  Not
			** for now.
			*/
			DBPRT(("%s: JOIN_JOB %s\n", id, pjob->ji_qs.ji_jobid))
			job_start_error(pjob, PBSE_SISCOMM, np->hn_host);
			break;

		case	IM_ABORT_JOB:
		case	IM_KILL_JOB:
			/*
			** The job is already in the process of being killed
			** but somebody has dropped off the face of the
			** earth.  Just check to see if everybody has
			** been heard from in some form or another and
			** set JOB_SUBSTATE_EXITING if so.
			*/
			DBPRT(("%s: KILL/ABORT JOB %s\n",
				id, pjob->ji_qs.ji_jobid))
			for (i=1; i<pjob->ji_numnodes; i++) {
				if (pjob->ji_hosts[i].hn_sister == SISTER_OKAY)
					break;
			}
			if (i == pjob->ji_numnodes) {	/* all dead */
				pjob->ji_qs.ji_substate =
					JOB_SUBSTATE_EXITING;
				exiting_tasks = 1;
			}
			break;
			
		case	IM_SPAWN_TASK:
		case	IM_GET_TASKS:
		case	IM_SIGNAL_TASK:
		case	IM_OBIT_TASK:
		case	IM_GET_INFO:
		case	IM_GET_RESC:
			/*
			** A user attemt failed, inform process.
			*/
			DBPRT(("%s: REQUEST %d %s\n", id,
				ep->ee_command, pjob->ji_qs.ji_jobid))

			ptask = task_check(pjob, ep->ee_taskid);
			if (ptask == NULL)
				break;
			(void)tm_reply(ptask->ti_fd, TM_ERROR, ep->ee_event);
			(void)diswsi(ptask->ti_fd, TM_ESYSTEM);
			(void)DIS_tcp_wflush(ptask->ti_fd);
			break;

		case	IM_POLL_JOB:
			/*
			** I must be Mother Superior for the job and
			** this is an error reply to a poll request.
			*/
			sprintf(log_buffer, "%s POLL failed from node %d",
				pjob->ji_qs.ji_jobid, np->hn_node);
			log_err(-1, id, log_buffer);
			pjob->ji_nodekill = np->hn_node;
			break;

		case	IM_GET_TID:
			/*
			** A request to Mother Superior to get
			** a TID has failed.
			*/
			DBPRT(("%s: GET_TID %s\n", id, pjob->ji_qs.ji_jobid))
			arrayfree(ep->ee_argv);
			arrayfree(ep->ee_envp);

			ptask = task_check(pjob, ep->ee_forward.fe_taskid);
			if (ptask == NULL)
				break;
			(void)tm_reply(ptask->ti_fd, TM_ERROR,
					ep->ee_forward.fe_event);
			(void)diswsi(ptask->ti_fd, TM_ESYSTEM);
			(void)DIS_tcp_wflush(ptask->ti_fd);
			break;

		default:
			sprintf(log_buffer, "unknown command %d saved", 
					ep->ee_command);
			log_err(-1, id, log_buffer);
			break;
		}

		delete_link(&ep->ee_next);
		free(ep);
		ep = (eventent *)GET_NEXT(np->hn_events);
	}
}

/*
**	Shut down all inter-node communication of a job: close the
**	stream of every host entry that has one (marking the host
**	SISTER_EOF) and dispose of the events queued on every host.
*/
void
term_job(pjob)
	job			*pjob;
{
	hnodent			*host;
	int			idx;

	for (idx = 0; idx < pjob->ji_numnodes; idx++) {
		host = &pjob->ji_hosts[idx];

		if (host->hn_stream >= 0) {
			rpp_close(host->hn_stream);
			host->hn_stream = -1;
			host->hn_sister = SISTER_EOF;
		}
		node_bailout(pjob, host);
	}
}

/*
**	Handle a stream that needs to be closed.
**	May be either from another Mom, or the server.
**
**	The reason (dis_emsg[ret]) and peer address are logged and the
**	stream is closed.  For the server stream only server_stream is
**	reset.  Otherwise the first job with a host entry using the
**	stream is looked up; that host is marked SISTER_EOF, its
**	pending events are bailed out, and if it is host entry 0
**	(Mother Superior) the job is marked as exiting.
*/
void
im_eof(stream, ret)
    int		stream;
    int		ret;
{
	static	char	id[] = "im_eof";
	int			idx;
	job			*pjob;
	hnodent			*np;
	struct	sockaddr_in	*addr;

	addr = rpp_getaddr(stream);
	sprintf(log_buffer,
		"%s from addr %s", dis_emsg[ret], netaddr(addr));
	log_err(-1, id, log_buffer);
	rpp_close(stream);

	if (stream == server_stream) {
		server_stream = -1;
		return;
	}

	/*
	** Search through all the jobs looking for this stream.
	** We want to find if any events are being waited for
	** from the "dead" stream and do something with them.
	*/
	for (pjob = (job *)GET_NEXT(svr_alljobs);
			pjob != NULL;
			pjob = (job *)GET_NEXT(pjob->ji_alljobs)) {
		for (idx = 0; idx < pjob->ji_numnodes; idx++) {
			if (pjob->ji_hosts[idx].hn_stream == stream)
				break;
		}
		if (idx < pjob->ji_numnodes)	/* found it */
			break;
	}
	if (pjob == NULL)
		return;

	/* forget the stream and get rid of whatever waited on it */
	np = &pjob->ji_hosts[idx];
	np->hn_stream = -1;
	np->hn_sister = SISTER_EOF;
	node_bailout(pjob, np);

	if (idx != 0)
		return;

	/*
	** The dead stream belongs to host entry 0: I'm a regular node
	** and my connection to Mother Superior is gone... kill job.
	*/
	sprintf(log_buffer, "job %s lost connection to MS on %s",
		pjob->ji_qs.ji_jobid, np->hn_host);
	log_err(-1, id, log_buffer);
	pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;
	exiting_tasks = 1;
}

/*
** Check that a message claiming to come from Mother Superior is
** acceptable.
**
**	- The peer port must be a reserved port (below IPPORT_RESERVED),
**	  otherwise the stream is closed.
**	- With no job given, that is all that is checked.
**	- The job must not be flagged JOB_SVFLG_HERE, i.e. I must not
**	  be Mother Superior talking to myself.
**	- The stream is recorded in host entry 0 of the job if it is
**	  not the one already there (a replaced stream is logged).
**
** Return TRUE on error, FALSE if okay.
*/
int
check_ms(stream, pjob)
    int		stream;
    job		*pjob;
{
	static	char	id[] = "check_ms";
	struct	sockaddr_in	*peer;
	hnodent			*ms;

	peer = rpp_getaddr(stream);
	if (ntohs(peer->sin_port) >= IPPORT_RESERVED) {
		sprintf(log_buffer,
			"non-privilaged connection from %s", netaddr(peer));
		log_err(-1, id, log_buffer);
		rpp_close(stream);
		return TRUE;
	}

	if (pjob == NULL)
		return FALSE;

	if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) {
		log_err(-1, id, "Mother Superior talking to herself");
		rpp_eom(stream);
		return TRUE;
	}

	/*
	** This should be mother superior calling; her entry is
	** the first one in the host table.
	*/
	ms = pjob->ji_hosts;
	if (ms->hn_stream == stream)
		return FALSE;

	if (ms->hn_stream != -1) {
		sprintf(log_buffer,
			"MS reset from %d to %d (%s)",
			ms->hn_stream, stream, netaddr(peer));
		log_err(-1, id, log_buffer);
	}
	ms->hn_stream = stream;
	return FALSE;
}

/*
**	Return the amount used of the resource "name" by a job.
**
**	The resource is looked up in the job's resources-used
**	attribute (JOB_ATR_resc_used) and handed to func, whose result
**	is returned.  0 is returned when the resource name is unknown
**	or the job has no entry for it.
*/
u_long
resc_used(pjob, name, func)
    job		*pjob;
    char	*name;
    u_long	(*func) A_((resource *pres));
{
	attribute		*at;
	resource_def		*rd;
	resource		*pres;
	u_long	val = 0L;

	/*
	** The address of an array element is never NULL, so no test
	** of "at" is needed (the old one could not trigger).
	*/
	at = &pjob->ji_wattr[(int)JOB_ATR_resc_used];

	rd = find_resc_def(svr_resc_def, name, svr_resc_size);
	if (rd == NULL)
		return 0;

	pres = find_resc_entry(at, rd);
	if (pres == NULL)
		return 0;

	val = func(pres);
	DBPRT(("resc_used: %s %lu\n", name, val))
	return val;
}

/*
**	Search the info list of a task for the entry called "name".
**	Returns the entry, or NULL if the task has no such entry.
*/
infoent	*
task_findinfo(ptask, name)
    task	*ptask;
    char	*name;
{
	infoent		*cur;

	cur = (infoent *)GET_NEXT(ptask->ti_info);
	while (cur != NULL && strcmp(cur->ie_name, name) != 0)
		cur = (infoent *)GET_NEXT(cur->ie_next);

	return cur;
}

/*
**	Save named info with a task.
**
**	If the task already has an entry called "name", its old info is
**	freed and replaced.  Otherwise a new entry is appended to the
**	task's info list and keeps the "name" pointer itself (no copy
**	is made).  The "info" pointer is stored as is in both cases.
**	NOTE(review): when an existing entry is reused the "name"
**	passed in is not stored; presumably the caller still owns it.
*/
void
task_saveinfo(ptask, name, info, len)
    task	*ptask;
    char	*name;
    void	*info;
    int		len;
{
	infoent		*entry;

	entry = task_findinfo(ptask, name);
	if (entry != NULL) {
		/* known name: only the old payload goes away */
		free(entry->ie_info);
	}
	else {
		/* new name: make an entry and hang it on the task */
		entry = (infoent *)malloc(sizeof(*entry));
		assert(entry);
		CLEAR_LINK(entry->ie_next);
		append_link(&ptask->ti_info, &entry->ie_next, entry);
		entry->ie_name = name;
	}

	entry->ie_len = len;
	entry->ie_info = info;
}


/*
**	Generate a resource string for a job.
**
**	The string starts with the result of getuname() followed by
**	':'.  If the job's resource attribute is of type ATR_TYPE_RESC,
**	its entries are encoded and appended as "name=value" pairs
**	separated by ','.  When the attribute encodes to no entries at
**	all, the character removed at the end is the ':' itself, so
**	only the name is returned in that case.
**
**	Returns a malloc'ed string, or NULL when memory can not be
**	obtained.
*/
char	*
resc_string(pjob)
    job		*pjob;
{
	attribute		*at;
	attribute_def		*ad;
	svrattrl		*pal;
	list_head		lhead;
	int			len, used, tot;
	char			*res_str, *ch, *bigger;
	char			*getuname();
	extern	int		resc_access_perm;

	ch = getuname();
	len = strlen(ch);
	/*
	** Always leave room for the name, the ':' and the terminating
	** NUL; a size based on the name length alone is too small for
	** very short names and can never grow from zero by doubling.
	*/
	tot = (len + 2) * 2;
	used = 0;
	res_str = (char *)malloc(tot);
	if (res_str == NULL)
		return NULL;
	strcpy(res_str, ch);
	used += len;
	res_str[used++] = ':';

	at = &pjob->ji_wattr[(int)JOB_ATR_resource];
	if (at->at_type != ATR_TYPE_RESC) {
		res_str[used] = '\0';
		return res_str;
	}
	ad = &job_attr_def[(int)JOB_ATR_resource];
	resc_access_perm = ATR_DFLAG_USRD;
	CLEAR_HEAD(lhead);
	(void)ad->at_encode(at,
			&lhead, ad->at_name,
			NULL, ATR_ENCODE_CLIENT);
	attrl_fixlink(&lhead);

	for (pal = (svrattrl *)GET_NEXT(lhead);
			pal;
			pal = (svrattrl *)GET_NEXT(pal->al_link)) {
		/* grow until "name=value," fits behind what is there */
		while (used + pal->al_rescln + pal->al_valln > tot) {
			tot *= 2;
			bigger = (char *)realloc(res_str, tot);
			if (bigger == NULL) {
				/* old block is still valid, release all */
				free_attrlist(&lhead);
				free(res_str);
				return NULL;
			}
			res_str = bigger;
		}
		/* each copy's NUL is overwritten by the next separator */
		strcpy(&res_str[used], pal->al_resc);
		used += (pal->al_rescln - 1);
		res_str[used++] = '=';
		strcpy(&res_str[used], pal->al_value);
		used += (pal->al_valln - 1);
		res_str[used++] = ',';
	}
	free_attrlist(&lhead);
	/* replace the last separator written with the terminator */
	res_str[--used] = '\0';
	return res_str;
}

/*
**	Input is coming from another MOM over a DIS rpp stream.
**	Read the stream to get a Inter-MOM request.
**
**	request (
**		jobid			string
**		cookie			string
**		command			int
**		event			int
**		task			int
**	)
**
**	Format the reply and write it back.
*/
void
im_request(stream, version)
     int	stream;
     int	version;
{
	char			*id = "im_request";
	int			command = 0;
	int			event_com, ret;
	char			*jobid = NULL;
	char			*cookie = NULL;
	char			*oreo;
	char			basename[50];
	char			namebuf[MAXPATHLEN];
	char			*nodestr;
	job			*pjob;
	task			*ptask;
	hnodent			*np;
	eventent		*ep;
	infoent			*ip;
	struct	sockaddr_in	*addr;
	u_long			ipaddr;
	int			i, errcode, nodeidx;
	int			reply;
	int			exitval;
	tm_node_id		nodeid;
	tm_task_id		fromtask, event_task, taskid;
	int			nodenum, index;
	int			num;
	int			sig;
	char			**argv, **envp, *cp, *globid;
	char			*name;
	void			*info;
	size_t			len;
	tm_event_t		event;
	fwdevent		efwd;
	list_head		lhead;
	svrattrl		*psatl;
	attribute_def		*pdef;
	struct	passwd		*check_pwd();
	extern	int		resc_access_perm;
	int	start_process	A_((task *pt, char **argv, char **envp));
	u_long	gettime		A_((resource *pres));
	u_long	getsize		A_((resource *pres));

	DBPRT(("%s: stream %d version %d\n", id, stream, version))
	if (version != IM_PROTOCOL_VER) {
		sprintf(log_buffer, "protocol version %d unknown\n", version);
		log_err(-1, id, log_buffer);
		rpp_close(stream);
		return;
	}

	/* check that machine is known */
	addr = rpp_getaddr(stream);
	ipaddr = ntohl(addr->sin_addr.s_addr);
	DBPRT(("connect from %s\n", netaddr(addr)))
	if (!tfind(ipaddr, &okclients)) {
		sprintf(log_buffer, "bad connect from %s",
			netaddr(addr));
		log_err(-1, id, log_buffer);
		rpp_close(stream);
		return;
	}

	jobid = disrst(stream, &ret);
	if (ret != DIS_SUCCESS)
		goto err;
	cookie = disrst(stream, &ret);
	if (ret != DIS_SUCCESS)
		goto err;
	command = disrsi(stream, &ret);
	if (ret != DIS_SUCCESS)
		goto err;
	event = disrsi(stream, &ret);
	if (ret != DIS_SUCCESS)
		goto err;
	fromtask = disrsi(stream, &ret);
	if (ret != DIS_SUCCESS)
		goto err;

	switch (command) {

	case IM_JOIN_JOB:
		/*
		** Sender is mom superior sending a job structure to me.
		** I am going to become a member of a job.
		**
		** auxiliary info (
		**	localnode id	int;
		**	number of nodes	int;
		**	stdout port	int;
		**	stderr port	int;
		**	nodeid 0	int;
		**	...
		**	nodeid n-1	int;
		**	jobattrs	attrl;
		** )
		*/
		reply = 1;
		if (check_ms(stream, NULL))
			goto fini;
		nodeid = disrsi(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		nodenum = disrsi(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;

		DBPRT(("%s: JOIN_JOB %s node %d\n", id, jobid, nodeid))
		np = NULL;
		/* does job already exist? */
		pjob = find_job(jobid);
		if (pjob) {	/* job is here */
			SEND_ERR(PBSE_JOBEXIST)
			goto done;
		}
		if ((pjob = job_alloc()) == (job *)0) {
			SEND_ERR(PBSE_SYSTEM)
			goto done;
		}

		pjob->ji_stdout = disrsi(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		pjob->ji_stderr = disrsi(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;

		pjob->ji_numnodes = nodenum;	 /* XXX */
		CLEAR_HEAD(lhead);
		if (decode_DIS_svrattrl(stream, &lhead) != DIS_SUCCESS)
			goto err;
		/*
		** Get the hashname from the attribute.
		*/
		psatl = (svrattrl *)GET_NEXT(lhead);
		while (psatl) {
			if (!strcmp(psatl->al_name, ATTR_hashname)) {
				(void)strcpy(basename, psatl->al_value);
				break;
                	}
			psatl = (svrattrl *)GET_NEXT(psatl->al_link);
		}
		(void)strcpy(pjob->ji_qs.ji_jobid, jobid);
		(void)strcpy(pjob->ji_qs.ji_fileprefix, basename);
		pjob->ji_modified = 1;
		pjob->ji_nodeid = nodeid;
		pjob->ji_qs.ji_svrflags = 0;
		pjob->ji_qs.ji_un_type  = JOB_UNION_TYPE_MOM;

		/* decode attributes from request into job structure */
		errcode = 0;
		resc_access_perm = READ_WRITE;
		for (psatl = (svrattrl *)GET_NEXT(lhead);
				psatl;
				psatl = (svrattrl *)GET_NEXT(psatl->al_link)) {

			/* identify the attribute by name */
			index = find_attr(job_attr_def, psatl->al_name,
					JOB_ATR_LAST);
			if (index < 0) {	/* didn`t recognize the name */
				errcode = PBSE_NOATTR;
				break;
			}
			pdef = &job_attr_def[index];

			/* decode attribute */
			errcode = pdef->at_decode(&pjob->ji_wattr[index],
					psatl->al_name, psatl->al_resc,
					psatl->al_value);
			if (errcode != 0)
				break;
		}
		free_attrlist(&lhead);
		if (errcode != 0) {
			(void)job_purge(pjob);
			SEND_ERR(errcode)
			goto done;
		}

		job_nodes(pjob);

		/* set remaining job structure elements */
		pjob->ji_qs.ji_state =    JOB_STATE_TRANSIT;
		pjob->ji_qs.ji_substate = JOB_SUBSTATE_PRERUN;
		pjob->ji_qs.ji_stime = time_now;
		pjob->ji_wattr[(int)JOB_ATR_mtime].at_val.at_long =
				(long)time_now;
		pjob->ji_wattr[(int)JOB_ATR_mtime].at_flags |= ATR_VFLAG_SET;
		pjob->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW;
		pjob->ji_qs.ji_un.ji_newt.ji_fromsock = -1;
		pjob->ji_qs.ji_un.ji_newt.ji_fromaddr = addr->sin_addr.s_addr;
		pjob->ji_qs.ji_un.ji_newt.ji_scriptsz = 0;

		if (check_pwd(pjob) == (struct passwd *)0) {
			LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
			          pjob->ji_qs.ji_jobid, log_buffer);
			(void)job_purge(pjob);
			SEND_ERR(PBSE_BADUSER)
			goto done;
		}
#if IBM_SP2==2        /* IBM SP with PSSP 3.1 */
		if (load_sp_switch(pjob) != 0) {
			(void)job_purge(pjob);
			SEND_ERR(PBSE_SYSTEM)
			goto done;
		}
#endif				/* IBM SP */
		(void)job_save(pjob, SAVEJOB_FULL);
		(void)strcpy(namebuf, path_jobs);      /* job directory path */
		(void)strcat(namebuf, pjob->ji_qs.ji_fileprefix);
		(void)strcat(namebuf, JOB_TASKDIR_SUFFIX);
		if (mkdir(namebuf, 0700) == -1) {
			(void)job_purge(pjob);
			SEND_ERR(PBSE_SYSTEM)
			goto done;
		}
		sprintf(log_buffer, "JOIN JOB as node %d", nodeid);
		log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
			jobid, log_buffer);
		/*
		** if certain resource limits require that the job usage be
		** polled, we link the job to mom_polljobs.
		**
		** NOTE: we overload the job field ji_jobque for this as it
		** is not used otherwise by MOM
	 	*/
		if ( mom_do_poll(pjob) )
			append_link(&mom_polljobs, &pjob->ji_jobque, pjob);
		append_link(&svr_alljobs, &pjob->ji_alljobs, pjob);

		ret = im_compose(stream, jobid, cookie, IM_ALL_OKAY,
				event, fromtask);
		goto done;

	case IM_ALL_OKAY:
	case IM_ERROR:
		reply = 0;
		break;

	default:
		reply = 1;
		break;
	}

	np = NULL;
	/*
	** Check if job already exists.
	*/
	if ((pjob = find_job(jobid)) == (job *)0) {
		SEND_ERR(PBSE_JOBEXIST)
		goto done;
	}

	/* check cookie */
	if ( !(pjob->ji_wattr[(int)JOB_ATR_Cookie].at_flags & ATR_VFLAG_SET)) {
		DBPRT(("%s: job %s has no cookie", id, jobid))
		SEND_ERR(PBSE_BADSTATE)
		goto done;
	}
	oreo = pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str;
	if (strcmp(oreo, cookie) != 0) {
		DBPRT(("%s: job %s cookie %s message %s", id,
			jobid, oreo, cookie))
		SEND_ERR(PBSE_BADSTATE)
		goto done;
	}
	/*
	** This is some processing needed that is common between
	** both kinds of reply.
	*/
	if (reply == 0) {
		for (nodeidx=0; nodeidx<pjob->ji_numnodes; nodeidx++) {
			np = &pjob->ji_hosts[nodeidx];

			if (np->hn_stream == stream)
				break;
		}
		if (nodeidx == pjob->ji_numnodes) {
			sprintf(log_buffer, "stream %d not found", stream);
			log_err(-1, id, log_buffer);
			goto err;
		}
		ep = (eventent *)GET_NEXT(np->hn_events);
		while (ep) {
			if (ep->ee_event == event &&
					ep->ee_taskid == fromtask)
				break;
			ep = (eventent *)GET_NEXT(ep->ee_next);
		}
		if (ep == NULL) {
			sprintf(log_buffer, "event %d taskid %ld not found",
					event, fromtask);
			log_err(-1, id, log_buffer);
			goto err;
		}

		efwd = ep->ee_forward;
		event_com = ep->ee_command;
		event_task = ep->ee_taskid;
		argv = ep->ee_argv;
		envp = ep->ee_envp;
		delete_link(&ep->ee_next);
		free(ep);
	}

	switch (command) {

	case	IM_KILL_JOB:
		/*
		** Sender is (must be) mom superior commanding me to kill a
		** job which I should be a part of.
		** Send a signal and set the jobstate to begin the
		** kill.  We wait for all tasks to exit before sending
		** an obit to mother superior.
		**
		** auxiliary info (
		**	none;
		** )
		*/
		if (check_ms(stream, pjob))
			goto fini;
		LOG_EVENT(PBSEVENT_DEBUG2, PBS_EVENTCLASS_JOB,
			  pjob->ji_qs.ji_jobid, "kill_job received");
		/*
		** Send the jobs a signal but we have to wait to
		** do a reply to mother superior until the procs
		** die and are reaped.
		*/
		DBPRT(("%s: KILL_JOB %s\n", id, jobid))
		reply = 0;
		kill_job(pjob, SIGKILL);
		pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;
		pjob->ji_obit = event;
		exiting_tasks = 1;
		break;

	case	IM_SPAWN_TASK:
		/*
		** Sender is a MOM in a job that wants to start a task.
		** I am MOM on the node that is to run the task.
		**
		** auxiliary info (
		**	parent node	tm_node_id
		**	task id		tm_task_id
		**	global id	string
		**	argv 0		string
		**	...
		**	argv n		string
		**	null
		**	envp 0		string
		**	...
		**	envp m		string
		** )
		*/
		nodeid = disrsi(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		if ((np = find_node(pjob, stream, nodeid)) == NULL) {
			SEND_ERR(PBSE_BADHOST)
			break;
		}
		taskid = disrsi(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		globid = disrst(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		DBPRT(("%s: SPAWN_TASK %s parent %d taskid %ld globid %s\n",
			id, jobid, nodeid, taskid, globid))

		if (pjob->ji_globid == NULL)
			pjob->ji_globid = globid;
		else if (strcmp(pjob->ji_globid, noglobid) == 0) {
			free(pjob->ji_globid);
			pjob->ji_globid = globid;
		}
		else if (strcmp(pjob->ji_globid, globid) != 0) {
			DBPRT(("%s: globid job %s received %s\n", id,
				pjob->ji_globid, globid))
			free(globid);
		}

		num = 4;
		argv = (char **)calloc(sizeof(char **), num);
		assert(argv);
		for (i=0;; i++) {
			if ((cp = disrst(stream, &ret)) == NULL)
				break;
			if (ret != DIS_SUCCESS)
				break;
			if (*cp == '\0') {
				free(cp);
				break;
			}
			if (i == num-1) {
				num *= 2;
				argv = (char **)realloc(argv,
						num*sizeof(char **));
				assert(argv);
			}
			argv[i] = cp;
		}
		argv[i] = NULL;
		if (ret != DIS_SUCCESS) {
			arrayfree(argv);
			goto err;
		}

		num = 8;
		envp = (char **)calloc(sizeof(char **), num);
		assert(envp);
		for (i=0;; i++) {
			if ((cp = disrst(stream, &ret)) == NULL)
				break;
			if (ret != DIS_SUCCESS)
				break;
			if (*cp == '\0') {
				free(cp);
				break;
			}
			if (i == num-1) {
				num *= 2;
				envp = (char **)realloc(envp,
						num*sizeof(char **));
				assert(envp);
			}
			envp[i] = cp;
		}
		envp[i] = NULL;
		if (ret != DIS_EOD) {
			arrayfree(argv);
			arrayfree(envp);
			goto err;
		}
		/*
		** do the spawn
		*/
		ret = DIS_SUCCESS;
		if ((ptask = task_create(pjob, taskid)) == NULL) {
			SEND_ERR(PBSE_SYSTEM);
			arrayfree(argv);
			arrayfree(envp);
			break;
		}
		strcpy(ptask->ti_qs.ti_parentjobid, jobid);
		ptask->ti_qs.ti_parentnode = nodeid;
		ptask->ti_qs.ti_parenttask = fromtask;
		if (task_save(ptask) == -1) {
			SEND_ERR(PBSE_SYSTEM)
			arrayfree(argv);
			arrayfree(envp);
			break;
		}
		if (start_process(ptask, argv, envp) == -1) {
			SEND_ERR(PBSE_SYSTEM)
		}
		else {
			ret = im_compose(stream, jobid, cookie, IM_ALL_OKAY,
					event, fromtask);
			if (ret != DIS_SUCCESS)
				break;
			ret = diswsi(stream, ptask->ti_qs.ti_task);
		}

		arrayfree(argv);
		arrayfree(envp);
		break;

	case	IM_GET_TASKS:
		/*
		** Sender is MOM which controls a task that wants to get
		** the list of tasks running here.
		**
		** auxiliary info (
		**	sending node	tm_node_id;
		** )
		*/
		nodeid = disrsi(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		DBPRT(("%s: GET_TASKS %s from node %d\n", id, jobid, nodeid))
		if ((np = find_node(pjob, stream, nodeid)) == NULL) {
			SEND_ERR(PBSE_BADHOST)
			break;
		}

		ret = im_compose(stream, jobid, cookie, IM_ALL_OKAY,
				event, fromtask);
		if (ret != DIS_SUCCESS)
			break;
		for (ptask=(task *)GET_NEXT(pjob->ji_tasks);
				ptask;
				ptask=(task *)GET_NEXT(ptask->ti_jobtask)) {
			ret = diswsi(stream, ptask->ti_qs.ti_task);
			if (ret != DIS_SUCCESS)
				break;
		}
		break;
		
	case	IM_SIGNAL_TASK:
		/*
		** Sender is MOM sending a task and signal to
		** deliver.
		**
		** auxiliary info (
		**	sending node	tm_node_id;
		**	taskid		tm_task_id;
		**	signal		int;
		** )
		*/
		nodeid = disrsi(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		if ((np = find_node(pjob, stream, nodeid)) == NULL) {
			SEND_ERR(PBSE_BADHOST)
			break;
		}
		taskid = disrsi(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		sig = disrsi(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		DBPRT(("%s: SIGNAL_TASK %s from node %d task %ld signal %d\n",
			id, jobid, nodeid, taskid, sig))
		ptask = task_find(pjob, taskid);
		if (ptask == NULL) {
			SEND_ERR(PBSE_JOBEXIST)
			break;
		}
		kill_task(ptask, sig);
		ret = im_compose(stream, jobid, cookie, IM_ALL_OKAY,
				event, fromtask);
		break;

	case	IM_OBIT_TASK:
		/*
		** Sender is MOM sending a request to monitor a
		** task for exit.
		**
		** auxiliary info (
		**	sending node	tm_node_id;
		**	taskid		tm_task_id;
		** )
		*/
		nodeid = disrsi(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		if ((np = find_node(pjob, stream, nodeid)) == NULL) {
			SEND_ERR(PBSE_BADHOST)
			break;
		}
		taskid = disrsi(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		ptask = task_find(pjob, taskid);
		if (ptask == NULL) {
			SEND_ERR(PBSE_JOBEXIST)
			break;
		}
		DBPRT(("%s: OBIT_TASK %s from node %d task %ld\n", id,
			jobid, nodeid, taskid))
		if (ptask->ti_qs.ti_status >= TI_STATE_EXITED) {
			ret = im_compose(stream, jobid, cookie, IM_ALL_OKAY,
					event, fromtask);
			if (ret != DIS_SUCCESS)
				break;
			ret = diswsi(stream, ptask->ti_qs.ti_exitstat);
		}
		else {	/* save obit request with task */
			obitent	*op = (obitent *)malloc(sizeof(obitent));
			assert(op);
			CLEAR_LINK(op->oe_next);
			append_link(&ptask->ti_obits, &op->oe_next, op);
			op->oe_info.fe_node = nodeid;
			op->oe_info.fe_event = event;
			op->oe_info.fe_taskid = fromtask;
			reply = 0;
		}
		break;

	case	IM_GET_INFO:
		/*
		** Sender is MOM sending a task and name to lookup
		** for info to report back.
		**
		** auxiliary info (
		**	sending node	tm_node_id;
		**	taskid		tm_task_id;
		**	name		string;
		** )
		*/
		nodeid = disrsi(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		if ((np = find_node(pjob, stream, nodeid)) == NULL) {
			SEND_ERR(PBSE_BADHOST)
			break;
		}
		taskid = disrsi(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		ptask = task_find(pjob, taskid);
		if (ptask == NULL) {
			SEND_ERR(PBSE_JOBEXIST)
			break;
		}
		name = disrst(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		DBPRT(("%s: GET_INFO %s from node %d task %ld name %s\n",
			id, jobid, nodeid, taskid, name))
		if ((ip = task_findinfo(ptask, name)) == NULL) {
			SEND_ERR(PBSE_JOBEXIST)
			break;
		}
		ret = im_compose(stream, jobid, cookie, IM_ALL_OKAY,
				event, fromtask);
		if (ret != DIS_SUCCESS)
			break;
		ret = diswcs(stream, ip->ie_info, ip->ie_len);
		break;

	case	IM_GET_RESC:
		/*
		** Sender is MOM requesting resource info to
		** report back its client.
		**
		** auxiliary info (
		**	sending node	tm_node_id;
		** )
		*/
		nodeid = disrsi(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		if ((np = find_node(pjob, stream, nodeid)) == NULL) {
			SEND_ERR(PBSE_BADHOST)
			break;
		}
		DBPRT(("%s: GET_RESC %s from node %d\n", id, jobid, nodeid))
		info = resc_string(pjob);
		ret = im_compose(stream, jobid, cookie, IM_ALL_OKAY,
				event, fromtask);
		if (ret != DIS_SUCCESS)
			break;
		ret = diswst(stream, info);
		free(info);
		break;

	case	IM_POLL_JOB:
		/*
		** Sender is (must be) mom superior commanding me to send
		** information for a job which I should be a part of.
		**
		** auxiliary info (
		**	none;
		** )
		*/
		if (check_ms(stream, pjob))
			goto fini;
		DBPRT(("%s: POLL_JOB %s\n", id, jobid))
		ret = im_compose(stream, jobid, cookie, IM_ALL_OKAY,
				event, fromtask);
		if (ret != DIS_SUCCESS)
			break;
		/*
		** Now comes a recommendation for killing the job.
		*/
		exitval = (pjob->ji_qs.ji_svrflags &
			(JOB_SVFLG_OVERLMT1|JOB_SVFLG_OVERLMT2)) ? 1 : 0;
		ret = diswsi(stream, exitval);
		if (ret != DIS_SUCCESS)
			break;
		/*
		** Send the information tallied for the job.
		*/
		ret = diswul(stream, resc_used(pjob, "cput", gettime));
		if (ret != DIS_SUCCESS)
			break;
		ret = diswul(stream, resc_used(pjob, "mem", getsize));
		break;

	case	IM_ABORT_JOB:
		/*
		** Sender is (must be) mom superior commanding me to
		** abort a JOIN_JOB request.
		**
		** auxiliary info (
		**	none;
		** )
		*/
		if (check_ms(stream, pjob))
			goto fini;
		DBPRT(("%s: ABORT_JOB %s\n", id, jobid))
		reply = 0;
		(void)job_purge(pjob);
		break;

	case	IM_GET_TID:
		/*
		** I must be mom superior getting a request from a
		** sub-mom to get a TID.
		**
		** auxiliary info (
		**	none;
		** )
		*/
		if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) == 0) {
			log_err(-1, id, "got GET_TID and I'm not MS");
			goto err;
		}
		DBPRT(("%s: GET_TID %s\n", id, jobid))
		ret = im_compose(stream, jobid, cookie, IM_ALL_OKAY,
				event, fromtask);
		if (ret != DIS_SUCCESS)
			break;
		ret = diswsi(stream, pjob->ji_taskid++);
		break;

	case	IM_ALL_OKAY:		/* this is a REPLY */
		/*
		** Sender is another MOM telling me that a request has
		** completed just dandy.
		*/
		switch (event_com) {

		case	IM_JOIN_JOB:
			/*
			** Sender is one of the systerhood saying she
			** got the job structure sent and she accepts it.
			** I'm mother superior.
			**
			** auxiliary info (
			**	none;
			** )
			*/
			if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) == 0) {
				log_err(-1, id,
					"got JOIN_JOB OKAY and I'm not MS");
				goto err;
			}
			for (i=0; i<pjob->ji_numnodes; i++) {
				np = &pjob->ji_hosts[i];
				if ((ep = (eventent *)GET_NEXT(np->hn_events))
						!= NULL)
					break;
			}
			if (ep == NULL)	{	/* no events */
				finish_exec(pjob);
				LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
					pjob->ji_qs.ji_jobid, log_buffer);
			}
			DBPRT(("%s: JOIN_JOB %s OKAY\n", id, jobid))
			break;

		case	IM_KILL_JOB:
			/*
			** Sender is sending a response that a job
			** which needs to die has been given the ax.
			** I'm mother superior.
			**
			** auxiliary info (
			**	cput	int;
			**	mem	int;
			** )
			*/
			if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) == 0) {
				log_err(-1, id,
					"got KILL_JOB OKAY and I'm not MS");
				goto err;
			}
			DBPRT(("%s: KILL_JOB %s OKAY\n", id, jobid))

			pjob->ji_resources[nodeidx-1].nr_cput =
				disrul(stream, &ret);
			if (ret != DIS_SUCCESS)
				goto err;
			pjob->ji_resources[nodeidx-1].nr_mem =
				disrul(stream, &ret);
			if (ret != DIS_SUCCESS)
				goto err;

			DBPRT(("%s: %s FINAL from %d cpu %lu sec mem %lu kb\n",
				id, jobid, nodeidx,
				pjob->ji_resources[nodeidx-1].nr_cput,
				pjob->ji_resources[nodeidx-1].nr_mem))

			/* don't close stream in case other jobs use it */
			np->hn_sister = SISTER_KILLDONE;
			for (i=1; i<pjob->ji_numnodes; i++) {
				if (pjob->ji_hosts[i].hn_sister == SISTER_OKAY)
					break;
			}
			if (i == pjob->ji_numnodes) {	/* all dead */
				DBPRT(("%s: ALL DONE, set EXITING job %s\n",
					id, jobid))
				pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;
				exiting_tasks = 1;
			}
			break;

		case	IM_SPAWN_TASK:
			/*
			** Sender is MOM responding to a "spawn_task"
			** request.
			**
			** auxiliary info (
			**	task id		tm_task_id;
			** )
			*/
			taskid = disrsi(stream, &ret);
			if (ret != DIS_SUCCESS)
				goto err;
			DBPRT(("%s: SPAWN_TASK %s OKAY task %ld\n",
				id, jobid, taskid))
			ptask = task_check(pjob, event_task);
			if (ptask == NULL)
				break;
			(void)tm_reply(ptask->ti_fd, TM_OKAY, event);
			(void)diswsi(ptask->ti_fd, taskid);
			(void)DIS_tcp_wflush(ptask->ti_fd);
			break;

		case	IM_GET_TASKS:
			/*
			** Sender is MOM giving a list of tasks which she
			** has started for this job.
			**
			** auxiliary info (
			**	task id		tm_task_id;
			**	...
			**	task id		tm_task_id;
			** )
			*/
			DBPRT(("%s: GET_TASKS %s OKAY\n", id, jobid))
			ptask = task_check(pjob, event_task);
			if (ptask == NULL)
				break;
			(void)tm_reply(ptask->ti_fd, TM_OKAY, event);
			for (;;) {
				DIS_rpp_reset();
				taskid = disrsi(stream, &ret);
				if (ret != DIS_SUCCESS) {
					if (ret == DIS_EOD)
						break;
					else
						goto err;
				}
				DIS_tcp_funcs();
				(void)diswsi(ptask->ti_fd, taskid);
			}
			DIS_tcp_funcs();
			(void)diswsi(ptask->ti_fd, TM_NULL_TASK);
			(void)DIS_tcp_wflush(ptask->ti_fd);
			break;

		case	IM_SIGNAL_TASK:
			/*
			** Sender is MOM with a good signal to report.
			**
			** auxiliary info (
			**	none;
			** )
			*/
			DBPRT(("%s: SIGNAL_TASK %s OKAY %ld\n",
				id, jobid, event_task))
			ptask = task_check(pjob, event_task);
			if (ptask == NULL)
				break;
			(void)tm_reply(ptask->ti_fd, TM_OKAY, event);
			(void)DIS_tcp_wflush(ptask->ti_fd);
			break;

		case	IM_OBIT_TASK:
			/*
			** Sender is MOM with a death report.
			**
			** auxiliary info (
			**	exit value	int;
			** )
			*/
			exitval = disrsi(stream, &ret);
			if (ret != DIS_SUCCESS)
				goto err;
			DBPRT(("%s: OBIT_TASK %s OKAY %ld exit val %d\n",
				id, jobid, event_task, exitval))
			ptask = task_check(pjob, event_task);
			if (ptask == NULL)
				break;
			(void)tm_reply(ptask->ti_fd, TM_OKAY, event);
			(void)diswsi(ptask->ti_fd, exitval);
			(void)DIS_tcp_wflush(ptask->ti_fd);
			break;

		case	IM_GET_INFO:
			/*
			** Sender is MOM with a named info to report.
			**
			** auxiliary info (
			**	info		counted string;
			** )
			*/
			info = disrcs(stream, &len, &ret);
			if (ret != DIS_SUCCESS)
				goto err;
			DBPRT(("%s: GET_INFO %s OKAY %ld\n",
				id, jobid, event_task))
			ptask = task_check(pjob, event_task);
			if (ptask == NULL) {
				free(info);
				break;
			}
			(void)tm_reply(ptask->ti_fd, TM_OKAY, event);
			(void)diswcs(ptask->ti_fd, info, len);
			(void)DIS_tcp_wflush(ptask->ti_fd);
			break;

		case	IM_GET_RESC:
			/*
			** Sender is MOM with a resource info to report.
			**
			** auxiliary info (
			**	info		counted string;
			** )
			*/
			info = disrst(stream, &ret);
			if (ret != DIS_SUCCESS)
				goto err;
			DBPRT(("%s: GET_RESC %s OKAY %ld\n",
				id, jobid, event_task))
			ptask = task_check(pjob, event_task);
			if (ptask == NULL) {
				free(info);
				break;
			}
			(void)tm_reply(ptask->ti_fd, TM_OKAY, event);
			(void)diswst(ptask->ti_fd, info);
			(void)DIS_tcp_wflush(ptask->ti_fd);
			break;

		case	IM_POLL_JOB:
			/*
			** I must be Mother Superior for the job and
			** this is a reply with job resources to
			** tally up.
			**
			** auxiliary info (
			**	recommendation	int;
			**	cput		u_long;
			**	mem		u_long;
			** )
			*/
			if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) == 0) {
				log_err(-1, id, "got POLL_JOB and I'm not MS");
				goto err;
			}
			exitval = disrsi(stream, &ret);
			if (ret != DIS_SUCCESS)
				goto err;
			pjob->ji_resources[nodeidx-1].nr_cput =
				disrul(stream, &ret);
			if (ret != DIS_SUCCESS)
				goto err;
			pjob->ji_resources[nodeidx-1].nr_mem =
				disrul(stream, &ret);
			if (ret != DIS_SUCCESS)
				goto err;
			DBPRT(("%s: POLL_JOB %s OKAY kill %d cpu %lu mem %lu\n",
				id, jobid, exitval,
				pjob->ji_resources[nodeidx-1].nr_cput,
				pjob->ji_resources[nodeidx-1].nr_mem))

			if (exitval)
				pjob->ji_nodekill = np->hn_node;
			break;

		case	IM_GET_TID:
			/*
			** Sender must be Mother Superior with a TID.
			** I will either do the spawn or forward the SPAWN
			** to the final destination.
			**
			** auxiliary info (
			**	task id		tm_task_id;
			** )
			*/
			if (check_ms(stream, pjob))
				goto fini;
			taskid = disrsi(stream, &ret);
			if (ret != DIS_SUCCESS)
				goto err;
			/*
			** Check to see if I need to forward the taskid
			** to another MOM.
			*/
			if (pjob->ji_nodeid != efwd.fe_node) {
				np = find_node(pjob, -1, efwd.fe_node);
				if (np == NULL)
					goto done;

				ep = event_alloc(IM_SPAWN_TASK, np,
					efwd.fe_event, efwd.fe_taskid);
				ret = im_compose(np->hn_stream, jobid, cookie,
					IM_SPAWN_TASK, efwd.fe_event,
					efwd.fe_taskid);
				if (ret != DIS_SUCCESS)
					goto done;
				ret = diswsi(np->hn_stream, pjob->ji_nodeid);
				if (ret != DIS_SUCCESS)
					goto done;
				ret = diswsi(np->hn_stream, taskid);
				if (ret != DIS_SUCCESS)
					goto done;
				ret = diswst(np->hn_stream, pjob->ji_globid);
				if (ret != DIS_SUCCESS)
					goto done;
				for (i=0; argv[i]; i++) {
					ret = diswst(np->hn_stream, argv[i]);
					if (ret != DIS_SUCCESS)
						goto done;
				}
				ret = diswst(np->hn_stream, "");
				if (ret != DIS_SUCCESS)
					goto done;
				for (i=0; envp[i]; i++) {
					ret= diswst(np->hn_stream, envp[i]);
					if (ret != DIS_SUCCESS)
						goto done;
				}
				ret = (rpp_flush(np->hn_stream) == -1) ?
					DIS_NOCOMMIT : DIS_SUCCESS;
				arrayfree(argv);
				arrayfree(envp);
				break;
			}

			/*
			** It's me, do the spawn.
			*/
			ret = 0;
			if ((ptask = task_create(pjob, taskid)) != NULL) {
				strcpy(ptask->ti_qs.ti_parentjobid, jobid);
				ptask->ti_qs.ti_parentnode = efwd.fe_node;
				ptask->ti_qs.ti_parenttask = efwd.fe_taskid;
				if (task_save(ptask) != -1)
					ret = start_process(ptask, argv, envp);
			}
			arrayfree(argv);
			arrayfree(envp);

			taskid = ptask->ti_qs.ti_task;
			ptask = task_check(pjob, efwd.fe_taskid);
			if (ptask == NULL)
				break;
			(void)tm_reply(ptask->ti_fd,
				(ret == -1) ? TM_ERROR : TM_OKAY,
				efwd.fe_event);
			(void)diswsi(ptask->ti_fd, (int)(ret == -1 ?
					TM_ESYSTEM : taskid));
			(void)DIS_tcp_wflush(ptask->ti_fd);

			break;

		default:
			sprintf(log_buffer, "unknown request type %d saved",
				event_com);
			log_err(-1, id, log_buffer);
			break;
		}
		break;

	case	IM_ERROR:		/* this is a REPLY */
		/*
		** Sender is responding to a request with an error code.
		**
		** auxiliary info (
		**	error value	int;
		** )
		*/
		errcode = disrsi(stream, &ret);
		if (ret != DIS_SUCCESS)
			goto err;

		switch (event_com) {

		case	IM_JOIN_JOB:
			/*
			** A MOM has rejected a request to join a job.
			** We need to send a ABORT_JOB to all the sisterhood
			** and fail the job start to server.
			** I'm mother superior.
			*/
			if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) == 0) {
				log_err(-1, id,
					"JOIN_JOB ERROR and I'm not MS");
				goto err;
			}
			DBPRT(("%s: JOIN_JOB %s returned ERROR %d\n",
				id, jobid, errcode))
			job_start_error(pjob, errcode, netaddr(addr));
			break;

		case	IM_ABORT_JOB:
		case	IM_KILL_JOB:
			/*
			** Job cleanup failed on a sister.
			** Wait for everybody to respond then finishup.
			** I'm mother superior.
			*/
			if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) == 0) {
				log_err(-1, id,
					"KILL_JOB %s ERROR and I'm not MS");
				goto err;
			}
			DBPRT(("%s: KILL/ABORT JOB %s returned ERROR %d\n",
				id, jobid, errcode))
			np->hn_sister = errcode ? errcode : SISTER_KILLDONE;
			for (i=1; i<pjob->ji_numnodes; i++) {
				if (pjob->ji_hosts[i].hn_sister == SISTER_OKAY)
					break;
			}
			if (i == pjob->ji_numnodes) {	/* all dead */
				pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;
				exiting_tasks = 1;
			}
			break;

		case	IM_SPAWN_TASK:
		case	IM_GET_TASKS:
		case	IM_SIGNAL_TASK:
		case	IM_OBIT_TASK:
		case	IM_GET_INFO:
			/*
			** A user attempt failed, inform process.
			*/
			DBPRT(("%s: REQUEST %d %s returned ERROR %d\n",
				id, event_com, jobid, errcode))
			ptask = task_check(pjob, event_task);
			if (ptask == NULL)
				break;
			(void)tm_reply(ptask->ti_fd, TM_ERROR, event);
			(void)diswsi(ptask->ti_fd, errcode);
			(void)DIS_tcp_wflush(ptask->ti_fd);
			break;

		case	IM_POLL_JOB:
			/*
			** I must be Mother Superior for the job and
			** this is an error reply to a poll request.
			*/
			if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) == 0) {
				log_err(-1, id,
					"POLL_JOB %s ERROR and I'm not MS");
				goto err;
			}
			DBPRT(("%s: POLL_JOB %s returned ERROR %d\n",
				id, jobid, errcode))
			np->hn_sister = errcode ? errcode : SISTER_BADPOLL;
			break;

		case	IM_GET_TID:
			/*
			** Sender must be Mother Superior failing to
			** send a TID.
			** Send a fail to the task which called SPAWN.
			*/
			if (check_ms(stream, pjob))
				goto fini;
			DBPRT(("%s: GET_TID %s returned ERROR %d\n",
				id, jobid, errcode))
			arrayfree(argv);
			arrayfree(envp);

			ptask = task_check(pjob, efwd.fe_taskid);
			if (ptask == NULL)
				break;
			(void)tm_reply(ptask->ti_fd, TM_ERROR, efwd.fe_event);
			(void)diswsi(ptask->ti_fd, errcode);
			(void)DIS_tcp_wflush(ptask->ti_fd);
			break;

		default:
			sprintf(log_buffer, "unknown command %d error", 
					event_com);
			log_err(-1, id, log_buffer);
			goto err;
		}
		break;

	default:
		sprintf(log_buffer, "unknown command %d sent", command);
		log_err(-1, id, log_buffer);
		goto err;
	}
 done:
	rpp_eom(stream);
	if (reply) {	/* check if write worked */
		if (ret != DIS_SUCCESS ||
				rpp_flush(stream) == -1) {
			log_err(errno, id, "rpp_flush");
			rpp_close(stream);
			if (np != NULL && np->hn_stream == stream)
				np->hn_stream = -1;
		}
	}
	goto fini;

 err:
	/*
	** We come here if we got a DIS read error or a protocol
	** element is missing.  The likely case is the remote
	** host has gone down.
	*/
	sprintf(log_buffer, "job %s: command %d",
			jobid ? jobid : "unknown", command);
	log_err(-1, id, log_buffer);
	im_eof(stream, ret);

 fini:
	if (jobid)
		free(jobid);
	if (cookie)
		free(cookie);

	return;
}

void
tm_eof(fd)
     int	fd;
{
	static	char	id[] = "tm_eof";
	job			*pjob;
	task			*ptask;

	/*
	** Search though all the jobs looking for this fd.
	*/
	for (pjob = (job *)GET_NEXT(svr_alljobs);
			pjob != NULL;
			pjob = (job *)GET_NEXT(pjob->ji_alljobs)) {
		for (ptask = (task *)GET_NEXT(pjob->ji_tasks);
				ptask;
				ptask = (task *)GET_NEXT(ptask->ti_jobtask)) {
			if (ptask->ti_fd == fd) {
				ptask->ti_fd = -1;
				return;
			}
		}
	}
	log_err(-1, id, "no matching task found");
	return;
}

/*
**	Input is coming from a process running on this host which
**	should be part of one of the jobs I am part of.  The i/o
**	will take place using DIS over a tcp fd.
**
**	Read the stream to get a task manager request.  Format the reply
**	and write it back.
**
**	read (
**		jobid			string
**		cookie			string
**		command			int
**		event			int
**		from taskid		int
**	)
**
*/
int
tm_request(fd, version)
     int	fd;
     int	version;
{
	char			*id = "tm_request";
	int			command, reply;
	int			ret = DIS_SUCCESS;
	char			*jobid = NULL;
	char			*cookie = NULL;
	char			*oreo;
	job			*pjob;
	task			*ptask;
	vnodent			*pnode;
	hnodent			*phost;
	int			i, len, event, numele;
	long			ipadd;
	char			**argv, **envp;
	char			*name, *info;
	eventent		*ep;
	infoent			*ip;
	int			signum;
	char			*nodestr = NULL;
	char			*pname;
	int			vnodenum;
	int			prev_error = 0;
	tm_node_id		nodeid;
	tm_task_id		taskid, fromtask;
	attribute		*at;
	extern	u_long		localaddr;
	extern	struct	connection	svr_conn[];
	int	start_process		A_((	task	*ptask,
						char	**argv,
						char	**envp));

	if (svr_conn[fd].cn_addr != localaddr) {
		sprintf(log_buffer, "non-local connect");
		goto err;
	}
	if (version != TM_PROTOCOL_VER) {
		sprintf(log_buffer, "bad protocol version %d", version);
		goto err;
	}

	jobid = disrst(fd, &ret);
	if (ret != DIS_SUCCESS)
		goto err;
	cookie = disrst(fd, &ret);
	if (ret != DIS_SUCCESS)
		goto err;
	command = disrsi(fd, &ret);
	if (ret != DIS_SUCCESS)
		goto err;
	event = disrsi(fd, &ret);
	if (ret != DIS_SUCCESS)
		goto err;
	fromtask = disrui(fd, &ret);
	if (ret != DIS_SUCCESS)
		goto err;

	DBPRT(("%s: job %s cookie %s task %ld com %d event %d\n", id, jobid,
			cookie, fromtask, command, event))

	/* verify the jobid is known and the cookie matches */
	if ((pjob = find_job(jobid)) == (job *)0) {
		sprintf(log_buffer, "job %s not found", jobid);
		goto err;
	}

	if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING) {
		sprintf(log_buffer, "job %s not running", jobid);
		goto err;
	}

	at = &pjob->ji_wattr[(int)JOB_ATR_Cookie];
	if ( !(at->at_flags & ATR_VFLAG_SET)) {
		sprintf(log_buffer, "job %s has no cookie", jobid);
		goto err;
	}
	oreo = at->at_val.at_str;
	if (strcmp(oreo, cookie) != 0) {
		sprintf(log_buffer, "job %s cookie %s message %s",
			jobid, oreo, cookie);
		goto err;
	}

	/* verify this taskid is my baby */
	ptask = task_find(pjob, fromtask);
	if (ptask == NULL) {	/* not found */
		sprintf(log_buffer,
			"task %ld in job %s not found",
			fromtask, jobid);
		log_err(-1, id, log_buffer);
		ret = tm_reply(fd, TM_ERROR, event);
		if (ret != DIS_SUCCESS)
			goto done;
		ret = diswsi(fd, TM_ENOTFOUND);
		if (ret != DIS_SUCCESS)
			goto done;
		prev_error = 1;
	}
	else if (ptask->ti_fd != -1 && ptask->ti_fd != fd) {
		/* we should have a new fd or match the old */
		sprintf(log_buffer,
			"extra TM connect from %s task %ld",
			jobid, fromtask);
		log_err(-1, id, log_buffer);
		goto err;
	}

	svr_conn[fd].cn_oncl = tm_eof;
	ptask->ti_fd = fd;
	reply = TRUE;

	switch (command) {

	case TM_INIT:
		/*
		** A request to initialize.  Must be the first
		** thing we see from a task to do psched requests.
		*/
		DBPRT(("%s: INIT %s\n", id, jobid))
		if (prev_error)
			goto done;

		ret = tm_reply(fd, TM_OKAY, event);
		if (ret != DIS_SUCCESS)
			goto done;
		vnodenum = pjob->ji_numvnod;
		ret = diswui(fd, vnodenum);	/* num nodes */
		if (ret != DIS_SUCCESS)
			goto done;

		pnode = pjob->ji_vnods;
		for (i=0; i<vnodenum; i++) {
			ret = diswsi(fd, pnode[i].vn_node);
			if (ret != DIS_SUCCESS)
				goto done;
		}
		ret = diswst(fd, ptask->ti_qs.ti_parentjobid);	/* dad job */
		if (ret != DIS_SUCCESS)
			goto done;
		ret = diswsi(fd, ptask->ti_qs.ti_parentnode);	/* dad node */
		if (ret != DIS_SUCCESS)
			goto done;
		ret = diswsi(fd, ptask->ti_qs.ti_parenttask);	/* dad task */
		if (ret != DIS_SUCCESS)
			goto done;

		ptask->ti_flags |= TI_FLAGS_INIT;
		goto done;

	case TM_POSTINFO:
		/*
		** Post named info for a task.
		**
		**	read (
		**		name		string;
		**		info		counted string;
		**	)
		*/
		name = disrst(fd, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		info = disrcs(fd, (size_t *)&len, &ret);
		if (ret != DIS_SUCCESS) {
			free(name);
			goto err;
		}
		DBPRT(("%s: POSTINFO %s task %ld sent info %s:%s(%d)\n", id,
			jobid, fromtask, name, info, len))
		if (prev_error)
			goto done;

		task_saveinfo(ptask, name, info, len);
		ret = tm_reply(fd, TM_OKAY, event);
		goto done;

	case TM_REGISTER:
		sprintf(log_buffer, "REGISTER - NOT IMPLEMENTED %s", jobid);
		(void)tm_reply(fd, TM_ERROR, event);
		(void)diswsi(fd, TM_ENOTIMPLEMENTED);
		(void)DIS_tcp_wflush(fd);
		goto err;

	default:
		break;
	}

	/*
	** All requests beside TM_INIT and TM_POSTINFO
	** require a node number where the action will take place.
	** Read that and check that it is legal.
	**
	**	read (
	**		node number		int
	**	)
	*/
	nodeid = disrui(fd, &ret);
	if (ret != DIS_SUCCESS)
		goto err;

	pnode = pjob->ji_vnods;
	for (i=0; i<pjob->ji_numvnod; i++, pnode++) {
		if (pnode->vn_node == nodeid)
			break;
	}
	if (i == pjob->ji_numvnod) {
		sprintf(log_buffer,
			"node %d in job %s not found",
			nodeid, jobid);
		log_err(-1, id, log_buffer);
		ret = tm_reply(fd, TM_ERROR, event);
		if (ret != DIS_SUCCESS)
			goto done;
		ret = diswsi(fd, TM_ENOTFOUND);
		if (ret != DIS_SUCCESS)
			goto done;
		prev_error = 1;
	}
	phost = pnode->vn_host;


	switch (command) {

	case TM_TASKS:
		/*
		** A request to read the list of tasks that a
		** particular node has charge of.
		*/
		DBPRT(("%s: TASKS %s on node %d\n",
			id, jobid, nodeid))
		if (prev_error)
			goto done;

		if (pjob->ji_nodeid != nodeid) {	/* not me */
			ep = event_alloc(IM_GET_TASKS, phost, event, fromtask);
			if (phost->hn_stream == -1) {
				phost->hn_stream = rpp_open(phost->hn_host,
							pbs_rm_port);
			}
			ret = im_compose(phost->hn_stream, jobid, cookie,
					IM_GET_TASKS, event, fromtask);
			if (ret != DIS_SUCCESS)
				goto done;
			ret = diswui(phost->hn_stream, pjob->ji_nodeid); /* XXX */
			if (ret != DIS_SUCCESS)
				goto done;
			ret = (rpp_flush(phost->hn_stream) == -1) ?
				DIS_NOCOMMIT : DIS_SUCCESS;
			if (ret != DIS_SUCCESS)
				goto done;
			reply = FALSE;
			goto done;
		}
		ret = tm_reply(fd, TM_OKAY, event);
		if (ret != DIS_SUCCESS)
			goto done;
		for (ptask=(task *)GET_NEXT(pjob->ji_tasks);
				ptask;
				ptask=(task *)GET_NEXT(ptask->ti_jobtask)) {
			ret = diswui(fd, ptask->ti_qs.ti_task);
			if (ret != DIS_SUCCESS)
				goto done;
		}
		ret = diswui(fd, TM_NULL_TASK);
		break;

	case TM_SPAWN:
		/*
		** Spawn a task on the requested node.
		**
		**	read (
		**		argc		int;
		**		arg 0		string;
		**		...
		**		arg argc-1	string;
		**		env 0		string;
		**		...
		**		env m		string;
		**	)
		*/
		DBPRT(("%s: SPAWN %s on node %d\n",
			id, jobid, nodeid))
		numele = disrui(fd, &ret);
		if (ret != DIS_SUCCESS)
			goto done;
		argv = (char **)calloc(numele+1, sizeof(char **));
		assert(argv);
		for (i=0; i<numele; i++) {
			argv[i] = disrst(fd, &ret);
			if (ret != DIS_SUCCESS) {
				arrayfree(argv);
				goto done;
			}
		}
		argv[i] = NULL;

		numele = 3;
		envp = (char **)calloc(numele, sizeof(char **));
		assert(envp);
		for (i=0;; i++) {
			char	*env;

			env = disrst(fd, &ret);
			if (ret != DIS_SUCCESS && ret != DIS_EOD) {
				arrayfree(argv);
				arrayfree(envp);
				goto done;
			}
			if (env == NULL)
				break;
			if (*env == '\0') {
				free(env);
				break;
			}
			if (i == numele) {
				numele *= 2;
				envp = (char **)realloc(envp,
					numele*sizeof(char **));
				assert(envp);
			}
			envp[i] = env;
		}
		envp[i] = NULL;
		ret = DIS_SUCCESS;

		if (prev_error) {
			arrayfree(argv);
			arrayfree(envp);
			goto done;
		}

		/*
		** If I'm Mother Superior and the spawn happens on
		** me, just do it.
		*/
		if (pjob->ji_nodeid == 0 && pjob->ji_nodeid == nodeid) {  /* XXX */
			i = TM_ERROR;
			ptask = task_create(pjob, TM_NULL_TASK);
			if (ptask != NULL) {
				strcpy(ptask->ti_qs.ti_parentjobid, jobid);
				ptask->ti_qs.ti_parentnode = pjob->ji_nodeid;
				ptask->ti_qs.ti_parenttask = fromtask;
				if (task_save(ptask) != -1) {
					ret = start_process(ptask, argv, envp);
					if (ret != -1)
						i = TM_OKAY;
				}
			}
			arrayfree(argv);
			arrayfree(envp);
			ret = tm_reply(fd, i, event);
			if (ret != DIS_SUCCESS)
				goto done;
			ret = diswsi(fd, ((i == TM_ERROR) ?
					TM_ESYSTEM :
					ptask->ti_qs.ti_task));
			goto done;
		}
		/*
		** If I'm a regular mom and the destination is not
		** MS, just send a GET_TID to MS.
		*/
		else if (pjob->ji_nodeid != 0 &&
				nodeid != pjob->ji_vnods[0].vn_node) { /* XXX */
			pnode = &pjob->ji_vnods[0];

			ep = event_alloc(IM_GET_TID, pnode->vn_host,
					TM_NULL_EVENT, TM_NULL_TASK);
			ep->ee_argv = argv;
			ep->ee_envp = envp;
			ep->ee_forward.fe_node = nodeid;
			ep->ee_forward.fe_event = event;
			ep->ee_forward.fe_taskid = fromtask;
			ret = im_compose(phost->hn_stream, jobid, cookie,
					IM_GET_TID, ep->ee_event,
					TM_NULL_TASK);
			if (ret != DIS_SUCCESS)
				goto done;
			ret = (rpp_flush(phost->hn_stream) == -1) ?
				DIS_NOCOMMIT : DIS_SUCCESS;
			if (ret != DIS_SUCCESS)
				goto done;
			reply = FALSE;
			goto done;
		}

		/*
		** If I am MS, generate the TID now, otherwise
		** we are sending to MS who will do it when she gets
		** the SPAWN.
		*/
		taskid = (pjob->ji_nodeid == 0) ?
			pjob->ji_taskid++ : TM_NULL_TASK;

		ep = event_alloc(IM_SPAWN_TASK, phost, event, fromtask);
		if (phost->hn_stream == -1) {
			phost->hn_stream = rpp_open(phost->hn_host,
						pbs_rm_port);
		}
		ret = im_compose(phost->hn_stream, jobid, cookie,
				IM_SPAWN_TASK, event, fromtask);
		if (ret != DIS_SUCCESS)
			goto done;
		ret = diswui(phost->hn_stream, pjob->ji_nodeid);
		if (ret != DIS_SUCCESS)
			goto done;
		ret = diswui(phost->hn_stream, taskid);
		if (ret != DIS_SUCCESS)
			goto done;
		ret = diswst(phost->hn_stream, pjob->ji_globid);
		if (ret != DIS_SUCCESS)
			goto done;
		for (i=0; argv[i]; i++) {
			ret = diswst(phost->hn_stream, argv[i]);
			if (ret != DIS_SUCCESS)
				goto done;
		}
		ret = diswst(phost->hn_stream, "");
		if (ret != DIS_SUCCESS)
			goto done;
		for (i=0; envp[i]; i++) {
			ret = diswst(phost->hn_stream, envp[i]);
			if (ret != DIS_SUCCESS)
				goto done;
		}
		ret = (rpp_flush(phost->hn_stream) == -1) ?
			DIS_NOCOMMIT : DIS_SUCCESS;
		if (ret != DIS_SUCCESS)
			goto done;
		reply = FALSE;
		arrayfree(argv);
		arrayfree(envp);

		break;

	case TM_SIGNAL:
		/*
		** Send a signal to the specified task.
		**
		**	read (
		**		to task			int
		**		signal			int
		**	)
		*/
		taskid = disrui(fd, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		signum = disrui(fd, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		DBPRT(("%s: SIGNAL %s on node %d task %ld sig %d\n",
			id, jobid, nodeid, taskid, signum))
		if (prev_error)
			goto done;

		if (pjob->ji_nodeid != nodeid) {	/* not me XXX */
			ep = event_alloc(IM_SIGNAL_TASK, phost,
					event, fromtask);
			if (phost->hn_stream == -1) {
				phost->hn_stream = rpp_open(phost->hn_host,
							pbs_rm_port);
			}
			ret = im_compose(phost->hn_stream, jobid, cookie,
					IM_SIGNAL_TASK, event, fromtask);
			if (ret != DIS_SUCCESS)
				goto done;
			ret = diswui(phost->hn_stream, pjob->ji_nodeid); /* XXX */
			if (ret != DIS_SUCCESS)
				goto done;
			ret = diswsi(phost->hn_stream, taskid);
			if (ret != DIS_SUCCESS)
				goto done;
			ret = diswsi(phost->hn_stream, signum);
			if (ret != DIS_SUCCESS)
				goto done;
			ret = (rpp_flush(phost->hn_stream) == -1) ?
				DIS_NOCOMMIT : DIS_SUCCESS;
			if (ret != DIS_SUCCESS)
				goto done;
			reply = FALSE;
			goto done;
		}

		/*
		** Task should be here... look for it.
		*/
		if ((ptask = task_find(pjob, taskid)) == NULL) {
			ret = tm_reply(fd, TM_ERROR, event);
			if (ret != DIS_SUCCESS)
				goto done;
			ret = diswsi(fd, TM_ENOTFOUND);
			break;
		}
		kill_task(ptask, signum);
		ret = tm_reply(fd, TM_OKAY, event);
		break;

	case TM_OBIT:
		/*
		** Register an obit request for the specified task.
		**
		**	read (
		**		task to watch		int
		**	)
		*/
		taskid = disrui(fd, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		DBPRT(("%s: OBIT %s on node %d task %ld\n",
			id, jobid, nodeid, taskid))
		if (prev_error)
			goto done;

		if (pjob->ji_nodeid != nodeid) {	/* not me */
			ep = event_alloc(IM_OBIT_TASK, phost, event, fromtask);
			if (phost->hn_stream == -1) {
				phost->hn_stream = rpp_open(phost->hn_host,
							pbs_rm_port);
			}
			ret = im_compose(phost->hn_stream, jobid, cookie,
					IM_OBIT_TASK, event, fromtask);
			if (ret != DIS_SUCCESS)
				goto done;
			ret = diswui(phost->hn_stream, pjob->ji_nodeid);
			if (ret != DIS_SUCCESS)
				goto done;
			ret = diswsi(phost->hn_stream, taskid);
			if (ret != DIS_SUCCESS)
				goto done;
			ret = (rpp_flush(phost->hn_stream) == -1) ?
				DIS_NOCOMMIT : DIS_SUCCESS;
			if (ret != DIS_SUCCESS)
				goto done;
			reply = FALSE;
			goto done;
		}
		/*
		** Task should be here... look for it.
		*/
		if ((ptask = task_find(pjob, taskid)) == NULL) {
			ret = tm_reply(fd, TM_ERROR, event);
			if (ret != DIS_SUCCESS)
				goto done;
			ret = diswsi(fd, TM_ENOTFOUND);
			break;
		}
		if (ptask->ti_qs.ti_status >= TI_STATE_EXITED) {
			ret = tm_reply(fd, TM_OKAY, event);
			if (ret != DIS_SUCCESS)
				goto done;
			ret = diswsi(fd, ptask->ti_qs.ti_exitstat);
		}
		else {
			obitent	*op = (obitent *)malloc(sizeof(obitent));
			assert(op);
			CLEAR_LINK(op->oe_next);
			append_link(&ptask->ti_obits, &op->oe_next, op);
			op->oe_info.fe_node = nodeid;
			op->oe_info.fe_event = event;
			op->oe_info.fe_taskid = fromtask;
			reply = 0;
		}
		break;

	case TM_GETINFO:
		/*
		** Get named info for a specified task.
		**
		**	read (
		**		task			int
		**		name			string
		**	)
		*/
		taskid = disrui(fd, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		name = disrst(fd, &ret);
		if (ret != DIS_SUCCESS)
			goto err;
		DBPRT(("%s: GETINFO %s from node %d task %ld name %s\n",
			id, jobid, nodeid, taskid, name))
		if (prev_error)
			goto done;

		if (pjob->ji_nodeid != nodeid) {	/* not me */
			ep = event_alloc(IM_GET_INFO, phost,
					event, fromtask);
			if (phost->hn_stream == -1) {
				phost->hn_stream = rpp_open(phost->hn_host,
							pbs_rm_port);
			}
			ret = im_compose(phost->hn_stream, jobid, cookie,
					IM_GET_INFO, event, fromtask);
			if (ret == DIS_SUCCESS) {
				ret = diswui(phost->hn_stream, pjob->ji_nodeid);
				if (ret == DIS_SUCCESS) {
					ret = diswsi(phost->hn_stream, taskid);
					if (ret == DIS_SUCCESS) {
						ret = diswst(phost->hn_stream,
								name);
					}
				}
			}
			free(name);
			if (ret != DIS_SUCCESS)
				goto done;
			ret = (rpp_flush(phost->hn_stream) == -1) ?
				DIS_NOCOMMIT : DIS_SUCCESS;
			if (ret != DIS_SUCCESS)
				goto done;
			reply = FALSE;
			goto done;
		}

		/*
		** Task should be here... look for it.
		*/
		if ((ptask = task_find(pjob, taskid)) != NULL) {
			if ((ip = task_findinfo(ptask, name)) != NULL) {
				ret = tm_reply(fd, TM_OKAY, event);
				if (ret != DIS_SUCCESS)
					goto done;
				ret = diswcs(fd, ip->ie_info, ip->ie_len);
				break;
			}
		}
		ret = tm_reply(fd, TM_ERROR, event);
		if (ret != DIS_SUCCESS)
			goto done;
		ret = diswsi(fd, TM_ENOTFOUND);
		break;

	case TM_RESOURCES:
		/*
		** Get resource string for a node.
		*/
		DBPRT(("%s: RESOURCES %s for node %d task %ld\n",
			id, jobid, nodeid, taskid))
		if (prev_error)
			goto done;

		if (pjob->ji_nodeid != nodeid) {	/* not me XXX */
			ep = event_alloc(IM_GET_RESC, phost,
					event, fromtask);
			if (phost->hn_stream == -1) {
				phost->hn_stream = rpp_open(phost->hn_host,
							pbs_rm_port);
			}
			ret = im_compose(phost->hn_stream, jobid, cookie,
					IM_GET_RESC, event, fromtask);
			if (ret != DIS_SUCCESS)
				goto done;
			ret = diswui(phost->hn_stream, pjob->ji_nodeid);
			if (ret != DIS_SUCCESS)
				goto done;
			ret = (rpp_flush(phost->hn_stream) == -1) ?
				DIS_NOCOMMIT : DIS_SUCCESS;
			if (ret != DIS_SUCCESS)
				goto done;
			reply = FALSE;
			goto done;
		}

		info = resc_string(pjob);
		ret = tm_reply(fd, TM_OKAY, event);
		if (ret != DIS_SUCCESS)
			goto done;
		ret = diswst(fd, info);
		free(info);
		break;

	default:
		sprintf(log_buffer, "unknown command %d", command);
		(void)tm_reply(fd, TM_ERROR, event);
		(void)diswsi(fd, TM_EUNKNOWNCMD);
		(void)DIS_tcp_wflush(fd);
		goto err;
	}

 done:
	if (reply) {
		DBPRT(("%s: REPLY %s\n", id, dis_emsg[ret]))
		if (ret != DIS_SUCCESS || DIS_tcp_wflush(fd) == -1) {
			sprintf(log_buffer, "comm failed %s", dis_emsg[ret]);
			log_err(errno, id, log_buffer);
			close_conn(fd);
		}
	}

	free(jobid);
	free(cookie);
	return 0;

 err:
	if (ret != DIS_SUCCESS)
		sprintf(log_buffer, "bad header %s", dis_emsg[ret]);
	log_err(errno, id, log_buffer);

	ipadd = svr_conn[fd].cn_addr;
	sprintf(log_buffer,
		"message refused from port %d addr %ld.%ld.%ld.%ld",
		svr_conn[fd].cn_port,
		(ipadd & 0xff000000) >> 24,
		(ipadd & 0x00ff0000) >> 16,
		(ipadd & 0x0000ff00) >> 8,
		(ipadd & 0x000000ff));
	close_conn(fd);
	if (jobid)
		free(jobid);
	if (cookie)
		free(cookie);
	return -1;
}
