/*
*         Portable Batch System (PBS) Software License
* 
* Copyright (c) 1999, MRJ Technology Solutions.
* All rights reserved.
* 
* Acknowledgment: The Portable Batch System Software was originally developed
* as a joint project between the Numerical Aerospace Simulation (NAS) Systems
* Division of NASA Ames Research Center and the National Energy Research
* Supercomputer Center (NERSC) of Lawrence Livermore National Laboratory.
* 
* Redistribution of the Portable Batch System Software and use in source
* and binary forms, with or without modification, are permitted provided
* that the following conditions are met:
* 
* - Redistributions of source code must retain the above copyright and
*   acknowledgment notices, this list of conditions and the following
*   disclaimer.
* 
* - Redistributions in binary form must reproduce the above copyright and 
*   acknowledgment notices, this list of conditions and the following
*   disclaimer in the documentation and/or other materials provided with the
*   distribution.
* 
* - All advertising materials mentioning features or use of this software must
*   display the following acknowledgment:
* 
*   This product includes software developed by NASA Ames Research Center,
*   Lawrence Livermore National Laboratory, and MRJ Technology Solutions.
* 
*         DISCLAIMER OF WARRANTY
* 
* THIS SOFTWARE IS PROVIDED BY MRJ TECHNOLOGY SOLUTIONS ("MRJ") "AS IS" WITHOUT 
* WARRANTY OF ANY KIND, AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT ARE EXPRESSLY DISCLAIMED.
* 
* IN NO EVENT, UNLESS REQUIRED BY APPLICABLE LAW, SHALL MRJ, NASA, NOR
* THE U.S. GOVERNMENT BE LIABLE FOR ANY DIRECT DAMAGES WHATSOEVER,
* NOR ANY INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* 
* This license will be governed by the laws of the Commonwealth of Virginia,
* without reference to its choice of law rules.
*/
/*
 * req_runjob.c - functions dealing with a Run Job Request
 */

#include <pbs_config.h>   /* the master config generated by configure */

#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#include <stdlib.h>
#include "libpbs.h"
#include "server_limits.h"
#include "list_link.h"
#include "attribute.h"
#include "resource.h"
#include "server.h"
#include "credential.h"
#include "batch_request.h"
#include "job.h"
#include "queue.h"
#include "work_task.h"
#include "pbs_error.h"
#include "log.h"
#include "acct.h"
#include "svrfunc.h"

/* RCS identification string for this source file */
static char ident[] = "@(#) $RCSfile: req_runjob.c,v $ $Revision: 2.4.2.1 $";

/* External Functions Called: */

/* send_job: used by svr_strtjob2() to ship the job to MOM; post_sendmom is */
/* passed as its completion callback					    */
extern int   send_job A_((job *, pbs_net_t, int, int, void (*x)(), struct batch_request * ));
/* set_resc_assigned: called with INCR by post_sendmom() once a job starts */
extern void  set_resc_assigned A_((job *, enum batch_op));
/* cpy_stage: builds the stage-in request for MOM; svr_stagein() treats a   */
/* null return as "no files to stage in"				    */
extern struct batch_request *cpy_stage A_((struct batch_request *, job *, enum job_atr, int));

/* Public Functions in this file */

int  svr_startjob A_((job *, struct batch_request *));

/* Private Functions local to this file */
/* (post_stagein has no prototype here; it is defined before its first use) */

static void post_sendmom A_((struct work_task *));
static int  svr_stagein A_((job *, struct batch_request *, int, int)); 
static int  svr_strtjob2 A_((job *, struct batch_request *));
static job *chk_job_torun A_((struct batch_request *preq));
static int  assign_hosts A_((job *, char *given, int setflag));

/* Global Data Items: */

extern pbs_net_t pbs_mom_addr;
extern int	 pbs_mom_port;
extern struct server server;
extern char  server_host[PBS_MAXHOSTNAME+1];
extern char  server_name[PBS_MAXSERVERNAME+1];
extern char *msg_badexit;
extern char *msg_jobrun;
extern char *msg_manager;
extern char *msg_stageinfail;
extern int   scheduler_jobct;
extern int   scheduler_sock;
extern time_t time_now;
extern int   svr_totnodes;	/* non-zero if using nodes */
extern int   svr_tsnodes;	/* non-zero if time-shared nodes */

/*
 * req_runjob - service the Run Job and Async Run Job Requests
 *
 *	This request forces a job into execution.  Client must be privileged.
 */

void req_runjob(preq)
	struct batch_request *preq;	/* Run Job or Async Run Job request */
{
	job	*pjob;
	int	 rc;

	/*
	 * chk_job_torun validates the request and assigns hosts to the job;
	 * when it returns null it has already rejected the request.
	 */
	if ((pjob = chk_job_torun(preq)) == (job *)0)
		return;

	if (preq->rq_conn == scheduler_sock)
		++scheduler_jobct;	/* see scheduler_close() */

	(void)sprintf(log_buffer, msg_manager, msg_jobrun,
		      preq->rq_user, preq->rq_host);
	log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
		  pjob->ji_qs.ji_jobid, log_buffer);

	/* If async run, reply now; otherwise reply is handled in */
	/* post_sendmom or post_stagein				  */

	if (preq->rq_type == PBS_BATCH_AsyrunJob) {
		reply_ack(preq);
		preq = 0;	/* cleared so we don't try to reuse */
	}

	rc = svr_startjob(pjob, preq);
	if (rc != 0) {
		/*
		 * Release the hosts assigned above whether or not a client
		 * is still waiting; previously an async request that failed
		 * to start left its nodes allocated.
		 */
		free_nodes(pjob);
		if (preq)
			req_reject(rc, 0, preq);
	}
}

/*
 * req_stagein - service the Stage In Files for a Job Request
 *
 *	This request causes MOM to start staging in files.
 *	Client must be privileged.
 */

void req_stagein(preq)
	struct batch_request *preq;	/* Stage In Files request */
{
	job	*pjob;
	int	 rc;

	/* a null return means the request has already been rejected */
	pjob = chk_job_torun(preq);
	if (pjob == (job *)0)
		return;

	/* the job must actually have a stage-in list */
	if (!(pjob->ji_wattr[(int)JOB_ATR_stagein].at_flags & ATR_VFLAG_SET)) {
		req_reject(PBSE_IVALREQ, 0, preq);
		return;
	}

	/* on success svr_stagein has acknowledged the request itself */
	rc = svr_stagein(pjob, preq, JOB_STATE_QUEUED, JOB_SUBSTATE_STAGEIN);
	if (rc != 0) {
		free_nodes(pjob);
		req_reject(rc, 0, preq);
	}
}

/*
 * post_stagein - process reply from MOM to stage-in request
 */

static void post_stagein(pwt)
	struct work_task *pwt;	/* wt_parm1 is the request that was sent to MOM */
{
	struct batch_request *preq = pwt->wt_parm1;
	int		      code = preq->rq_reply.brp_code;
	job		     *pjob;
	attribute	     *pwait;
	int		      newstate;
	int		      newsub;

	/* rq_extra carries the job id saved by svr_stagein */
	pjob = find_job(preq->rq_extra);
	free(preq->rq_extra);

	if (pjob == (job *)0) {
		/* job no longer exists, just drop the request */
		release_req(pwt);
		return;
	}

	if (code == 0) {

		/* stage in was successful */

		pjob->ji_qs.ji_svrflags |= JOB_SVFLG_StagedIn;
		if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_STAGEGO) {
			svr_evaljobstate(pjob, &newstate, &newsub, 0);
			(void)svr_setjobstate(pjob, newstate, newsub);
		} else {
			/* a run was pending on the stage-in, continue it */
			(void)svr_strtjob2(pjob, (struct batch_request *)0);
		}

	} else {

		/* stage in failed - put the job into a wait state */

		free_nodes(pjob);

		/* set a retry time only if no execution time is set */
		pwait = &pjob->ji_wattr[(int)JOB_ATR_exectime];
		if (!(pwait->at_flags & ATR_VFLAG_SET)) {
			pwait->at_val.at_long = time_now + PBS_STAGEFAIL_WAIT;
			pwait->at_flags |= ATR_VFLAG_SET;
			(void)job_set_wait(pwait, pjob, 0);
		}

		(void)svr_setjobstate(pjob, JOB_STATE_WAITING,
				      JOB_SUBSTATE_STAGEFAIL);

		/* pass any text MOM returned on to the job owner */
		if (preq->rq_reply.brp_choice == BATCH_REPLY_CHOICE_Text)
			svr_mailowner(pjob, MAIL_STAGEIN, MAIL_FORCE,
				      preq->rq_reply.brp_un.brp_txt.brp_str);
	}

	release_req(pwt);	/* close connection and release request */
}

/*
 * svr_stagein - direct MOM to stage in the requested files for a job
 */

static int svr_stagein(pjob, preq, state, substate)
	job *pjob;			/* job whose files are to be staged */
	struct batch_request *preq;	/* client request to ack, or null */
	int state;			/* job state to set once started */
	int substate;			/* job substate to set once started */
{
	struct batch_request *stagereq;
	int		      rc;

	stagereq = cpy_stage((struct batch_request *)0, pjob,
			     JOB_ATR_stagein, STAGE_DIR_IN);

	if (stagereq == 0) {
		/* no files to stage-in, go direct to sending job to mom */
		return ( svr_strtjob2(pjob, preq) );
	}

	/*
	 * Have files to stage in: save the job id for post_stagein.
	 * NOTE(review): stagereq itself does not appear to be released on
	 * either error return below - confirm whether something else frees it.
	 */
	stagereq->rq_extra = malloc(PBS_MAXSVRJOBID+1);
	if (stagereq->rq_extra == 0)
		return (PBSE_SYSTEM);
	(void)strcpy(stagereq->rq_extra, pjob->ji_qs.ji_jobid);

	rc = relay_to_mom(pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
			  stagereq, post_stagein);
	if (rc != 0) {
		free(stagereq->rq_extra);
		return (rc);
	}

	(void)svr_setjobstate(pjob, state, substate);

	/*
	 * stage-in started ok - reply to client now as the copy may
	 * take too long to wait for.
	 */
	if (preq)
		reply_ack(preq);

	return (rc);
}

/*
 * svr_startjob - place a job into running state by shipping it to MOM
 */

int svr_startjob(pjob, preq)
	job  *pjob;			/* job to run */
	struct batch_request *preq;	/* NULL or Run Job batch request */
{
	int f;
	int rc;

	/* if not already setup, transfer the control/script file basename */
	/* into an attribute accessable to MOM				   */

	if (!(pjob->ji_wattr[(int)JOB_ATR_hashname].at_flags & ATR_VFLAG_SET))
		if (job_attr_def[(int)JOB_ATR_hashname].at_decode(
				&pjob->ji_wattr[(int)JOB_ATR_hashname],
				(char *)0, (char *)0, 
				pjob->ji_qs.ji_fileprefix))
			return (PBSE_SYSTEM);


	/* if exec_host alread set and either (hot start or checkpoint)	*/
	/* then use the host(s) listed in exec_host			*/

	rc = 0;
	f = pjob->ji_wattr[(int)JOB_ATR_exec_host].at_flags & ATR_VFLAG_SET;
	if (f && ( (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HOTSTART) ||
	           (pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHKPT) ) 
	      && ( (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HasNodes) == 0) ) {

		rc = assign_hosts(pjob, pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str, 0);
	} else if (f == 0) {
		/* exec_host not already set, get hosts and set it */
		rc = assign_hosts(pjob, NULL, 1);
	}
	if (rc != 0)
			return rc;
				

	/* Next, are there files to be staged-in? */

	if ((pjob->ji_wattr[(int)JOB_ATR_stagein].at_flags & ATR_VFLAG_SET) &&
	    (pjob->ji_qs.ji_substate != JOB_SUBSTATE_STAGECMP)) {

		/* yes, we do that first; then start the job */

		rc = svr_stagein(pjob, preq, JOB_STATE_RUNNING,
						JOB_SUBSTATE_STAGEGO);

		/* note, the positive acknowledgment is done by svr_stagein */

	} else {

		/* No stage-in or already done, start job executing */

		rc = svr_strtjob2(pjob, preq);
	}
	return (rc);
}

/*
 * svr_strtjob2 - mark the job PRERUN and hand it to send_job for delivery
 *	to MOM, with post_sendmom as the completion routine.
 *
 *	Returns 0 when send_job returns 2; otherwise the previous state and
 *	substate are restored and pbs_errno is returned.
 */
static int svr_strtjob2(pjob, preq)
	job		     *pjob;
	struct batch_request *preq;	/* passed through to post_sendmom */
{
	int	prev_state = pjob->ji_qs.ji_state;
	int	prev_subst = pjob->ji_qs.ji_substate;
	int	sent;

	/* send the job to MOM */

	(void)svr_setjobstate(pjob, JOB_STATE_RUNNING, JOB_SUBSTATE_PRERUN);

	sent = send_job(pjob, pjob->ji_qs.ji_un.ji_exect.ji_momaddr,
			pbs_mom_port, MOVE_TYPE_Exec, post_sendmom,
			(void *)preq);
	if (sent == 2)
		return (0);

	/* the send could not be started, put the job back as it was */

	log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
		  pjob->ji_qs.ji_jobid,
		  "Unable to Run Job, send to Mom failed");
	pjob->ji_qs.ji_destin[0] = '\0';
	(void)svr_setjobstate(pjob, prev_state, prev_subst);
	return (pbs_errno);
}

/*
 * post_sendmom - clean up action for child started in send_job
 *	which was sending a job "home" to MOM
 *
 * If send was successful, mark job as executing, and call stat_mom_job()
 * to obtain session id.
 *
 * If send didn't work, requeue the job.
 *
 * If the work_task has a non-null wt_parm2, it is the address of a batch
 * request to which a reply must be sent.
 *
 * Returns: none.
 */

static void post_sendmom(pwt)
	struct work_task *pwt;
{
	char	*id = "post_sendmom";
	int	 newstate;
	int	 newsub;
	int	 r;
	int	 stat = pwt->wt_aux;
	job	*jobp = (job *)pwt->wt_parm1;
	struct batch_request *preq = (struct batch_request *)pwt->wt_parm2;

	if (WIFEXITED(stat)) {

		r = WEXITSTATUS(stat);
	} else {
		r = 2;
		(void)sprintf(log_buffer, msg_badexit, stat);
		(void)strcat(log_buffer, id);
		log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_JOB,
			  jobp->ji_qs.ji_jobid, log_buffer);
	}

	switch (r) {

	case 0:		/* send to MOM went ok */

		jobp->ji_qs.ji_svrflags &= ~JOB_SVFLG_HOTSTART;
		if (preq)
			reply_ack(preq);
			
		/* record start time for accounting */
		jobp->ji_qs.ji_stime = time_now;

		if (jobp->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) {
			/* may be EXITING if job finished first */
			(void)svr_setjobstate(jobp, JOB_STATE_RUNNING,
						    JOB_SUBSTATE_RUNNING);	
			/* above saves job structure */
		}

		/* accounting log for start or restart */
		if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_CHKPT)
			account_record(PBS_ACCT_RESTRT, jobp, (char *)0);
		else
			account_jobstr(jobp);

		/* update resource usage attributes */

		set_resc_assigned(jobp, INCR);

		/* if any dependencies, see if action required */
		
		if (jobp->ji_wattr[(int)JOB_ATR_depend].at_flags&ATR_VFLAG_SET)
			(void)depend_on_exec(jobp);

		svr_mailowner(jobp, MAIL_BEGIN, MAIL_NORMAL, (char *)0);
		/*
		 * it is unfortunate, but while the job has gone into execution,
		 * there is no way of obtaining the session id except by making
		 * a status request of MOM.  (Even if the session id was passed
		 * back to the sending child, it couldn't get up to the parent.)
		 */
		jobp->ji_momstat = 0;
		stat_mom_job(jobp);
		break;

	default :	/* send failed, requeue the job */

		log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
			  jobp->ji_qs.ji_jobid,
			  "Unable to Run Job, MOM rejected");

		free_nodes(jobp);

		if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_ABORT) {
			if (preq)
			    req_reject(PBSE_MOMREJECT, 0, preq);

			svr_evaljobstate(jobp, &newstate, &newsub, 1);
			(void)svr_setjobstate(jobp, newstate, newsub);
		} else {
			if (preq)
			    req_reject(PBSE_BADSTATE, 0, preq);
		}
			
		break;
	}
	return;
}

/*
 * chk_job_torun - check state and past execution host of a job for which
 *	files are about to be staged in or the job is about to be run.
 * 	Returns pointer to job if all is ok, else returns null.
 */

static job *chk_job_torun(preq)
	struct batch_request *preq;
{
	attribute_def	 *pdef;
	attribute	 *pattr;
	job		 *pjob;
	struct rq_runjob *prun;
	int 		  rc;

	prun = &preq->rq_ind.rq_run;
	if ((pjob = chk_job_request(prun->rq_jid, preq)) == 0)
		return (job *)0;

	if ((pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)       ||
	    (pjob->ji_qs.ji_state == JOB_STATE_EXITING)	      ||
	    (pjob->ji_qs.ji_substate == JOB_SUBSTATE_STAGEGO) ||
	    (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)  ||
	    (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING))  {
		req_reject(PBSE_BADSTATE, 0, preq);
		return (job *)0;
	}
	if (preq->rq_type == PBS_BATCH_StageIn) {
		if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_STAGEIN) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return (job *)0;
		}
	}

	if ((preq->rq_perm & ( ATR_DFLAG_MGWR | ATR_DFLAG_OPWR )) == 0) {
		req_reject(PBSE_PERM, 0, preq);
		return (job *)0;
	}

	/* the job must be in an execution queue */

	if (pjob->ji_qhdr->qu_qs.qu_type != QTYPE_Execution) {
		req_reject(PBSE_IVALREQ, 0, preq);
		return (job *)0;
	}

	/* where to execute the job */

	if (pjob->ji_qs.ji_svrflags & (JOB_SVFLG_CHKPT | JOB_SVFLG_StagedIn)) {

		/* job has been checkpointed or files already staged in */
		/* in this case, exec_host must be already set	 	*/

		if (prun->rq_destin[0] != '\0') {

			/* specified destination must match exec_host */

			if (strcmp(prun->rq_destin, pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str) != 0) {
				req_reject(PBSE_EXECTHERE, 0, preq);
				return (job *)0;
			}
		}
		if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HasNodes) == 0) {
			/* re-reserve nodes and leave exec_host as is */
			if ((rc = assign_hosts(pjob,
			   pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str,
			   0)) != 0) {
				req_reject(PBSE_EXECTHERE, 0, preq);
				return (job *)0;
			}
		}
	} else {

		/* job has not run before or need not run there again	*/
		/* reserve nodes and set exec_host anew			*/

		if (prun->rq_destin[0] == '\0') {
			rc = assign_hosts(pjob, 0, 1);
		} else {
			rc = assign_hosts(pjob, prun->rq_destin, 1);
		}

		if (rc != 0) {
			req_reject(rc, 0, preq);
			return (job *)0;
		}
	}
			
	return (pjob);
}

/*
 * assign_hosts - assign hosts (nodes) to job by the following rules:
 *	1. use nodes that are "given"; from exec_host when required by
 *		checkpoint-restart or file stage-in, or from run command.
 *	2. use nodes that match user's resource request.
 *	3. use default (local system or a single node).
 */

static int assign_hosts(pjob, given, set_exec_host)
	job  *pjob;		/* job to which hosts are assigned */
	char *given;		/* host spec to use, or null to choose one */
	int   set_exec_host;	/* non-zero: (re)write the exec_host attribute */
{
	unsigned int	 dummy;
	char		*list = 0;
	char		*hosttoalloc;
	pbs_net_t	 momaddr = 0;
	resource	*pres;
	int		 rc = 0;
	extern char 	*mom_host;
	
	pres = find_resc_entry(&pjob->ji_wattr[(int)JOB_ATR_resource],
			       find_resc_def(svr_resc_def, "neednodes",
			       svr_resc_size) );

	if (given != 0) {	/* assign what was specified in run request */
		hosttoalloc = given;
		
	} else if (pres != 0) {	/* assign what was in "neednodes"	    */

		hosttoalloc = pres->rs_value.at_val.at_str;
		if (hosttoalloc == 0) 
			return (PBSE_UNKNODEATR);

	} else if (svr_totnodes == 0) {		/* assign "local"    */
		if ((server.sv_attr[(int)SVR_ATR_DefNode].at_flags & ATR_VFLAG_SET) && (server.sv_attr[(int)SVR_ATR_DefNode].at_val.at_str != 0)) {
			hosttoalloc = server.sv_attr[(int)SVR_ATR_DefNode].at_val.at_str;
		} else {
			hosttoalloc = mom_host;
			momaddr = pbs_mom_addr;
		}
	} else if ((server.sv_attr[(int)SVR_ATR_DefNode].at_flags & ATR_VFLAG_SET) && (server.sv_attr[(int)SVR_ATR_DefNode].at_val.at_str != 0)) {
		/* alloc what server's attribute default_node is set to */
		hosttoalloc = server.sv_attr[(int)SVR_ATR_DefNode].at_val.at_str;
	} else if (svr_tsnodes != 0) {
		/* find first time-shared node */
		if ((hosttoalloc = find_ts_node()) == NULL)
			return (PBSE_NOTSNODE);
	} else {
		/* fall back to 1 cluster node */
		hosttoalloc = PBS_DEFAULT_NODE;
	}


	/* do we need to allocate the (cluster) node(s)? */

	if (svr_totnodes != 0) {
		/* set_nodes is only called when is_ts_node returns non-zero */
		if ((rc = is_ts_node(hosttoalloc)) != 0) {
			rc = set_nodes(pjob, hosttoalloc, &list);
			set_exec_host = 1;	/* maybe new VPs, must set */
			hosttoalloc = list;
		}
	} 
	if (rc == 0) {
		if (set_exec_host) {
			job_attr_def[(int)JOB_ATR_exec_host].at_free(
				&pjob->ji_wattr[(int)JOB_ATR_exec_host]);
			(void)job_attr_def[(int)JOB_ATR_exec_host].at_decode(
				&pjob->ji_wattr[(int)JOB_ATR_exec_host],
				(char *)0,
				(char *)0,
				hosttoalloc);
			pjob->ji_modified = 1;
		} else {
			/* leave exec_host alone and reuse old IP address */
			momaddr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;
		
			hosttoalloc = pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str;
		}

		/*
		 * strncpy() does not terminate the destination when the
		 * source fills it, so bound the copy to the buffer and add
		 * the terminator explicitly.
		 * Assumes ji_destin is a char array member, so that sizeof
		 * yields its capacity - TODO confirm against job.h.
		 */
		(void)strncpy(pjob->ji_qs.ji_destin,
				parse_servername(hosttoalloc, &dummy),
				sizeof(pjob->ji_qs.ji_destin) - 1);
		pjob->ji_qs.ji_destin[sizeof(pjob->ji_qs.ji_destin) - 1] = '\0';

		if (momaddr == 0) {
			momaddr = get_hostaddr( pjob->ji_qs.ji_destin);
			if (momaddr == 0) {
				/* host does not resolve, give the nodes back */
				free_nodes(pjob);
				free(list);	/* free(NULL) is a no-op */
				return (PBSE_BADHOST);
			}
		}
		pjob->ji_qs.ji_un.ji_exect.ji_momaddr = momaddr;
	}
	free(list);	/* free(NULL) is a no-op */
	return (rc);
}
