/*
*         Portable Batch System (PBS) Software License
* 
* Copyright (c) 1999, MRJ Technology Solutions.
* All rights reserved.
* 
* Acknowledgment: The Portable Batch System Software was originally developed
* as a joint project between the Numerical Aerospace Simulation (NAS) Systems
* Division of NASA Ames Research Center and the National Energy Research
* Supercomputer Center (NERSC) of Lawrence Livermore National Laboratory.
* 
* Redistribution of the Portable Batch System Software and use in source
* and binary forms, with or without modification, are permitted provided
* that the following conditions are met:
* 
* - Redistributions of source code must retain the above copyright and
*   acknowledgment notices, this list of conditions and the following
*   disclaimer.
* 
* - Redistributions in binary form must reproduce the above copyright and 
*   acknowledgment notices, this list of conditions and the following
*   disclaimer in the documentation and/or other materials provided with the
*   distribution.
* 
* - All advertising materials mentioning features or use of this software must
*   display the following acknowledgment:
* 
*   This product includes software developed by NASA Ames Research Center,
*   Lawrence Livermore National Laboratory, and MRJ Technology Solutions.
* 
*         DISCLAIMER OF WARRANTY
* 
* THIS SOFTWARE IS PROVIDED BY MRJ TECHNOLOGY SOLUTIONS ("MRJ") "AS IS" WITHOUT 
* WARRANTY OF ANY KIND, AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT ARE EXPRESSLY DISCLAIMED.
* 
* IN NO EVENT, UNLESS REQUIRED BY APPLICABLE LAW, SHALL MRJ, NASA, NOR
* THE U.S. GOVERNMENT BE LIABLE FOR ANY DIRECT DAMAGES WHATSOEVER,
* NOR ANY INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* 
* This license will be governed by the laws of the Commonwealth of Virginia,
* without reference to its choice of law rules.
*/
/*
 * req_shutdown.c - contains the functions to shutdown the server
 */
#include <pbs_config.h>   /* the master config generated by configure */

#include <sys/types.h>
#include "libpbs.h"
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <string.h>
#include "server_limits.h"
#include "list_link.h"
#include "work_task.h"
#include "log.h"
#include "attribute.h"
#include "server.h"
#include "credential.h"
#include "batch_request.h"
#include "job.h"
#include "queue.h"
#include "pbs_error.h"
#include "svrfunc.h"

/* Revision-control identification string for this source file */
static char ident[] = "@(#) $RCSfile: req_shutdown.c,v $ $Revision: 2.1 $";

/* Private Functions Local to this File */

static int shutdown_chkpt A_((job *));
static void post_chkpt A_((struct work_task *));
static void rerun_or_kill A_((job *, char *text));

/* Private Data Items */

/*
 * The batch request that asked for the shutdown, if there is one.
 * Recorded by req_shutdown(); acknowledged and cleared by shutdown_ack().
 */
static struct batch_request *pshutdown_request = 0;

/* Global Data Items: */

/* list of jobs walked by svr_shutdown(), and message texts used for logging */
extern list_head svr_alljobs;
extern char *msg_abort_on_shutdown;
extern char *msg_daemonname;
extern char *msg_init_queued;
extern char *msg_shutdown_op;
extern char *msg_shutdown_start;
extern char *msg_leftrunning;
extern char *msg_stillrunning;
extern char *msg_on_shutdown;
extern char *msg_job_abort;

/* server-wide structures; "server" holds the SRV_ATR_State attribute */
extern list_head task_list_event;
extern struct server server;
extern attribute_def svr_attr_def[];


/*
 * svr_shutdown() - Perform (or start of) the shutdown of the server
 */

void svr_shutdown(type)
	int type;
{
	attribute	  *pattr;
	job		  *pjob;
	job		  *pnxt;
	long		 *state;

	/* Lets start by logging shutdown and saving everything */

	state = &server.sv_attr[(int)SRV_ATR_State].at_val.at_long;
	(void)strcpy(log_buffer, msg_shutdown_start);

	if (*state == SV_STATE_SHUTIMM) {

		/* if already shuting down, another Immed/sig will force it */

		if ((type == SHUT_IMMEDIATE) || (type == SHUT_SIG)) {
			*state = SV_STATE_DOWN;
			(void)strcat(log_buffer, "Forced");
			log_event(PBSEVENT_SYSTEM|PBSEVENT_ADMIN|PBSEVENT_DEBUG,
		  		  PBS_EVENTCLASS_SERVER, msg_daemonname,
				  log_buffer);
			return;
		}
	}

	if (type == SHUT_IMMEDIATE) {
		*state = SV_STATE_SHUTIMM;
		(void)strcat(log_buffer, "Immediate");

	} else if (type == SHUT_DELAY) {
		*state = SV_STATE_SHUTDEL;
		(void)strcat(log_buffer, "Delayed");

	} else if (type == SHUT_QUICK) {
		*state = SV_STATE_DOWN; /* set to down to brk pbsd_main loop */
		(void)strcat(log_buffer, "Quick");

	} else {
		*state = SV_STATE_SHUTIMM;
		(void)strcat(log_buffer, "By Signal");
	}
	log_event(PBSEVENT_SYSTEM|PBSEVENT_ADMIN|PBSEVENT_DEBUG,
		  PBS_EVENTCLASS_SERVER, msg_daemonname, log_buffer);

	if (type == SHUT_QUICK) /* quick, leave jobs as are */
		return;

	svr_save(&server, SVR_SAVE_QUICK);

	pnxt = (job *)GET_NEXT(svr_alljobs);
	while ((pjob = pnxt) != (job *)0) {
	    pnxt = (job *)GET_NEXT(pjob->ji_alljobs);

	    if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) {

		pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HOTSTART|JOB_SVFLG_HASRUN;
		pattr = &pjob->ji_wattr[(int)JOB_ATR_chkpnt];
		if ( (pattr->at_val.at_str) &&
		      (*pattr->at_val.at_str != 'n') ) {
			/* do checkpoint of job */

			if (shutdown_chkpt(pjob) == 0)
				continue;
		}

		/* if not checkpoint (not supported, not allowed, or fails */
		/* rerun if possible, else kill job			   */

		rerun_or_kill(pjob, msg_on_shutdown);
	    }
	}
	return;
}

/*
 * shutdown_ack - acknowledge the shutdown (terminate) request, if the
 *	shutdown was started by one.  This is about the last thing the
 *	server does before going away.
 *
 *	The saved request pointer is cleared afterwards, so a second call
 *	does nothing.
 */

void shutdown_ack()
{
	struct batch_request *pending = pshutdown_request;

	if (pending == (struct batch_request *)0)
		return;		/* no request was saved by req_shutdown() */

	reply_ack(pending);
	pshutdown_request = 0;
}

/*
 * req_shutdown - process a batch request to shut down the server.
 *
 *	preq	the shutdown request
 *
 *	The requester must hold at least one of the manager or operator
 *	read/write permission bits; otherwise the request is rejected with
 *	PBSE_PERM.  An accepted request is logged, remembered so that
 *	shutdown_ack() can reply to it later, and handed to svr_shutdown().
 */

void req_shutdown(preq)
	struct batch_request *preq;
{
	if (preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_MGRD |
			     ATR_DFLAG_OPRD | ATR_DFLAG_OPWR)) {

		/* log who asked for the shutdown and from which host */

		(void)sprintf(log_buffer, msg_shutdown_op,
			      preq->rq_user, preq->rq_host);
		log_event(PBSEVENT_SYSTEM|PBSEVENT_ADMIN|PBSEVENT_DEBUG,
			  PBS_EVENTCLASS_SERVER, msg_daemonname, log_buffer);

		/* the reply is sent from main(), via shutdown_ack(), when done */

		pshutdown_request = preq;

		svr_shutdown(preq->rq_ind.rq_shutdown);

	} else {

		/* neither operator nor manager */

		req_reject(PBSE_PERM, 0, preq);
	}
	return;
}

/*
 * shutdown_chkpt - perform checkpoint of job by issuing a hold request to mom
 *
 *	pjob	the running job to checkpoint
 *
 *	A Hold Job request carrying a system hold (HOLD_s) is built and
 *	relayed to the job's MOM; post_chkpt() is registered to handle the
 *	reply.
 *
 *	Returns: 0 if the request was relayed; the job is then marked with
 *		   substate RERUN and the HASRUN and CHKPT flags, and saved.
 *		 PBSE_SYSTEM if the request could not be allocated or the
 *		   hold attribute could not be encoded.
 *		 -1 if the request could not be relayed to MOM.
 */

static int shutdown_chkpt(pjob)
	job *pjob;
{
	struct batch_request *phold;
	attribute 	      temp;

	phold = alloc_br(PBS_BATCH_HoldJob);
	if (phold == (struct batch_request *)0) 
		return (PBSE_SYSTEM);

	/* temporary attribute holding the value to encode: a system hold */

	temp.at_flags = ATR_VFLAG_SET;
	temp.at_type  = job_attr_def[(int)JOB_ATR_hold].at_type;
	temp.at_val.at_long = HOLD_s;

	phold->rq_perm = ATR_DFLAG_MGRD | ATR_DFLAG_MGWR;
	(void)strcpy(phold->rq_ind.rq_hold.rq_orig.rq_objname, pjob->ji_qs.ji_jobid);
	CLEAR_HEAD(phold->rq_ind.rq_hold.rq_orig.rq_attr);
	if (job_attr_def[(int)JOB_ATR_hold].at_encode(&temp,
				&phold->rq_ind.rq_hold.rq_orig.rq_attr,
				job_attr_def[(int)JOB_ATR_hold].at_name,
				(char *)0,
				ATR_ENCODE_CLIENT) < 0) {

		/*
		 * The request has not been handed to anyone yet, so it must
		 * be released here or it is leaked.  free_br() is expected
		 * to be declared in batch_request.h.
		 */

		free_br(phold);
		return (PBSE_SYSTEM);
	}

	if ( relay_to_mom(pjob->ji_qs.ji_un.ji_exect.ji_momaddr, phold, post_chkpt) != 0) {

		/*
		 * NOTE(review): whether relay_to_mom() disposes of the
		 * request when it fails cannot be seen from this file, so
		 * the request is deliberately not freed here - confirm.
		 */

		return (-1);
	}

	/* request is on its way; record that a checkpoint is pending */

	pjob->ji_qs.ji_substate  = JOB_SUBSTATE_RERUN;
	pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN | JOB_SVFLG_CHKPT;
	(void)job_save(pjob, SAVEJOB_QUICK);
	return (0);
}

/*
 * post-chkpt - clean up after shutdown_chkpt
 *	This is called on the reply from MOM to a Hold request made in
 *	shutdown_chkpt().  If the request succeeded, then record in job.
 *	If the request failed, then we fall back to rerunning or aborting
 *	the job.
 */

static void post_chkpt(ptask)
	struct work_task *ptask;
{
	job		     *pjob;
	struct batch_request *preq;
	int		      state;
	int		      subst;

	preq = (struct batch_request *)ptask->wt_parm1;
	pjob = find_job(preq->rq_ind.rq_hold.rq_orig.rq_objname);
	if (preq->rq_reply.brp_code == 0) {
		/* checkpointed ok */
		if (preq->rq_reply.brp_auxcode)	/* chkpt can be moved */
		    pjob->ji_qs.ji_svrflags =
				(pjob->ji_qs.ji_svrflags & ~JOB_SVFLG_CHKPT) |
				JOB_SVFLG_HASRUN | JOB_SVFLG_ChkptMig;
	
	} else {
		/* need to try rerun if possible or just abort the job */

		if (pjob) {
		    pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHKPT;
		    pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;
		    if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
			rerun_or_kill(pjob, msg_on_shutdown);
		}
	}

	release_req(ptask);
}

/*
 * rerun_or_kill - deal with a running job that is not being checkpointed
 *
 *	pjob	the job
 *	text	text appended to the message that is logged for the job
 *
 *	A rerunable job is sent SIGKILL and its substate is set to RERUN.
 *	A job that is not rerunable is aborted, unless the server is in a
 *	delayed shutdown, in which case it is left running.
 */

static void rerun_or_kill(pjob, text)
	job  *pjob;
	char *text;
{
	long svr_state = server.sv_attr[(int)SRV_ATR_State].at_val.at_long;
	long can_rerun = pjob->ji_wattr[(int)JOB_ATR_rerunable].at_val.at_long;

	if (!can_rerun && (svr_state != SV_STATE_SHUTDEL)) {

		/* job not rerunable, immediate shutdown - kill it off */

		(void)strcpy(log_buffer, msg_job_abort);
		(void)strcat(log_buffer, text);

		/* the message must be logged before the job is purged */

		log_event(PBSEVENT_SYSTEM|PBSEVENT_JOB|PBSEVENT_DEBUG,
			  PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid,
			  log_buffer);
		(void)job_abt(pjob, log_buffer);
		return;
	}

	if (can_rerun) {

		/* job is rerunable, mark it to be requeued */

		(void)issue_signal(pjob, "SIGKILL", release_req, 0);
		pjob->ji_qs.ji_substate  = JOB_SUBSTATE_RERUN;
		(void)strcpy(log_buffer, msg_init_queued);
		(void)strcat(log_buffer, pjob->ji_qhdr->qu_qs.qu_name);
	} else {

		/* delayed shutdown, leave job running */

		(void)strcpy(log_buffer, msg_leftrunning);
	}
	(void)strcat(log_buffer, text);

	log_event(PBSEVENT_SYSTEM|PBSEVENT_JOB|PBSEVENT_DEBUG,
		  PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid,
		  log_buffer);
}
