/* $Id: getrsrcs.c,v 1.1 1999/11/15 23:34:50 hender Exp $ */

/*
 *  Obtain resource information from the resource monitor and
 *  job counts from the server.
 */

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <string.h>
#include <unistd.h>

/* PBS header files */
#include "pbs_error.h"
#include "pbs_ifl.h"
#include "log.h"
#include "resmon.h"
#include "rm.h"

/* Scheduler header files */
#include "toolkit.h"
#include "gblxvars.h"

/*
 * Upper bound, in seconds, on the whole conversation with a host's
 * resource monitor.  The value is handed directly to alarm(), and may be
 * overridden at compile time.
 */
#ifndef	GETRSRCS_CONNECT_TIME
#define	GETRSRCS_CONNECT_TIME	10	/* seconds to wait for resource info */
#endif	/* ! GETRSRCS_CONNECT_TIME */

/*
 * Head of the singly-linked list of per-host Resources that have already
 * been fetched.  schd_get_resources() searches it by hostname and appends
 * new entries at the tail; schd_dump_rsrclist() walks it for logging.
 */
Resources *schd_RsrcsList = NULL;	/* List of hosts and their resources */

/* NOTE(review): not referenced in the visible part of this file - confirm it is still needed. */
extern char *schd_CmdStr[16];

/* Logs the contents of a single Resources entry; defined below. */
static void dump_resources(Resources *rsrcs);

/*
 * SIGALRM handler used while talking to the resource monitor.
 *
 * The handler intentionally performs no work.  It exists only so that the
 * signal is caught rather than left to its previous disposition, letting
 * a system call that is blocked when the alarm fires be interrupted.
 */
/* ARGSUSED */
static void
connect_interrupt(int signo)
{
	(void)signo;	/* The signal number is of no interest here. */
}

/*
 * Find an entry for the resources for the requested host in the list of
 * existing resources, or create a new one for that host and return it.
 *
 * If 'exechost' is already present in schd_RsrcsList, the cached entry is
 * returned and the resource monitor is not contacted.  Otherwise the
 * resource monitor on 'exechost' is asked for "mppe_app" (total application
 * PEs) and "mppe_avail" (available application PEs); both values are scaled
 * by schd_FAKE_MACH_MULT, stored in a newly allocated Resources struct, and
 * that struct is appended to schd_RsrcsList.
 *
 * The conversation with the resource monitor is bounded by an alarm of
 * GETRSRCS_CONNECT_TIME seconds.  Any alarm that was pending on entry is
 * re-armed before returning, less the time spent in here.
 *
 * Parameters:
 *   exechost - name of the host whose resources are wanted.
 *
 * Returns a pointer to the (list-owned) Resources for the host, or NULL if
 * 'exechost' is NULL, memory could not be allocated, or the resource
 * monitor could not be contacted or gave an unusable reply.  The caller
 * must not free the returned structure.
 */
Resources *
schd_get_resources(char *exechost)
{
    char   *id = "schd_get_resources";
    Resources *rptr, *new_rsrcs;
    int     rm;

    char   *response;
    int     badreply   = 0;
    int     cpus_avail = 0;
    int     cpus_tot   = 0;

    struct sigaction act, oact;
    int     handler_set = 0;	/* Nonzero once our SIGALRM handler is in. */

    unsigned int remain = 0;	/* Time remaining in any old alarm(). */
    unsigned int elapsed;	/* Seconds spent under our own alarm(). */
    time_t  then = 0;		/* When this alarm() was started. */
    time_t  now;

    /* Without a hostname there is nothing to look up or to query. */
    if (exechost == NULL) {
	(void)sprintf(log_buffer, "No exechost given - cannot get resources.");
	log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
	DBPRT(("%s: %s\n", id, log_buffer));

	return (NULL);
    }

    /*
     * Check for a local copy of the resources being available already.
     * If so, just return a reference to that Resources structure.
     */
    for (rptr = schd_RsrcsList; rptr != NULL; rptr = rptr->next)
	if (strcmp(rptr->exechost, exechost) == 0)
	    return (rptr);

    schd_timestamp("get_rsrcs");

    /* 
     * No cached resource information for 'exechost'.  Need to query the
     * host for its information.  calloc() hands back zeroed storage, so
     * every field not explicitly set below starts out as 0/NULL.
     */
    if ((new_rsrcs = calloc(1, sizeof(Resources))) == NULL) {
	(void)sprintf(log_buffer, "Unable to alloc space for Resources.");
	log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
	DBPRT(("%s: %s\n", id, log_buffer));

	return (NULL);	/* Can't get the information - nowhere to store it. */
    }

    act.sa_flags = 0;
    act.sa_handler = connect_interrupt;
    sigemptyset(&act.sa_mask);

    /* 
     * Set the alarm, and maintain some idea of how long was left on any
     * previously set alarm.  If the handler cannot be installed, the
     * existing handler and alarm are left completely untouched: arming an
     * alarm without our handler in place could deliver SIGALRM to whatever
     * disposition happens to be current.
     */
    if (sigaction(SIGALRM, &act, &oact) == 0) {
	handler_set = 1;
	remain = alarm(GETRSRCS_CONNECT_TIME);
	then = time(NULL);
    }

    /* Any negative value is treated as "no connection". */
    if ((rm = openrm(exechost, 0)) < 0) {
	(void)sprintf(log_buffer,
	      "Unable to contact resmom@%s (%d)", exechost, pbs_errno);
	log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

	badreply = 1;
	goto bail;
    }

    /*
     * Turn off full response.  Responses will be received in the order in 
     * which they are sent.
     */
    fullresp(0);

    /* Build a list of all the resources about which we want information. */

    addreq(rm, "mppe_app");
    addreq(rm, "mppe_avail");

    /* Get the values back from the resource monitor, in request order. */

    /* Receive MPPE_APP response from resource monitor. */
    /* returns the total number of Application PEs configured */
    response = getreq(rm);
    if (response != NULL) {
	cpus_tot = atoi(response) * schd_FAKE_MACH_MULT;
	/*
	 * NOTE(review): the PBS rm library returns getreq() replies in
	 * malloc'd storage that the caller owns - confirm against rm(3B).
	 */
	free(response);
    } else {
	(void)sprintf(log_buffer, "bad return from getreq(mppe_app), %d, %d", 
		      pbs_errno, errno);
	log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
	badreply = 1;
	goto bail;
    }

    /* Receive MPPE_AVAIL response from resource monitor. */
    /* returns the largest contiguous block of APP PEs */
    response = getreq(rm);
    if (response != NULL) {
	cpus_avail = atoi(response) * schd_FAKE_MACH_MULT;
	free(response);		/* Same ownership note as above. */
    } else {
	(void)sprintf(log_buffer, "bad return from getreq(mppe_avail), %d, %d", 
		      pbs_errno, errno);
	log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
	badreply = 1;
	goto bail;
    }
    new_rsrcs->freemem = MB_PER_NODE * schd_FAKE_MACH_MULT;

bail:
    /*
     * Disconnect from the resource monitor.  Stream 0 is a legitimate
     * handle and a negative value means the open failed, so test the sign
     * rather than the truth value.
     */
    if (rm >= 0)
	closerm(rm);

    if (handler_set) {
	/* Unset the alarm and put the previous handler back. */
	alarm(0);
	sigaction(SIGALRM, &oact, NULL);

	/*
	 * Reset the old alarm, taking into account how much time has
	 * passed.  'remain' is unsigned, so compare before subtracting to
	 * avoid wrapping around to a huge timeout.
	 */
	if (remain) {
	    now = time(NULL);
	    elapsed = (now > then) ? (unsigned int)(now - then) : 0;

	    DBPRT(("%s: old alarm had %u secs remaining, %u elapsed, ", id,
		    remain, elapsed));

	    /* 
	     * Would the previous time have already expired?  If so,
	     * schedule an alarm call in 1 second (close enough, hopefully).
	     */
	    if (elapsed >= remain)
		remain = 1;
	    else
		remain -= elapsed;

	    DBPRT(("reset to %u secs\n", remain));
	    alarm(remain);
	}
    }

    /*
     * Verify all the data came back as expected; if not, abort this 
     * iteration of the scheduler.
     */
    if (badreply) {
	(void)sprintf(log_buffer, 
	    "Got bad info from mom@%s - aborting sched run", exechost);
	log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
	DBPRT(("%s: %s\n", id, log_buffer));

	free(new_rsrcs);
	return (NULL);
    }

    /* Make a copy of the hostname for the resources struct. */
    new_rsrcs->exechost = schd_strdup(exechost);
    if (new_rsrcs->exechost == NULL) {
	(void)sprintf(log_buffer, "Unable to copy exechost %s to rsrcs",
	    exechost);
	log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);
	DBPRT(("%s: %s\n", id, log_buffer));

	free(new_rsrcs);
	return (NULL);
    }

    new_rsrcs->nodes_total = cpus_tot;
    new_rsrcs->nodes_alloc = cpus_tot - cpus_avail;

    if (schd_RsrcsList == NULL) {
	schd_RsrcsList  = new_rsrcs;	/* Start the list. */
    } else {
	for (rptr = schd_RsrcsList; rptr->next != NULL; rptr = rptr->next)
	    /* Find the last element in the list. */ ;
	rptr->next = new_rsrcs;
    }
    /* Next pointer for the tail of the list points to nothing. */
    new_rsrcs->next = NULL;

    return (new_rsrcs);
}

/*
 * Log the resource information held for every host currently on
 * schd_RsrcsList, in list order.  Does nothing if the list is empty.
 */
void
schd_dump_rsrclist(void)
{
    Resources *cur = schd_RsrcsList;

    while (cur != NULL) {
	dump_resources(cur);
	cur = cur->next;
    }
}

/*
 * Write the contents of one Resources entry to the log: the host name,
 * free memory, node allocation (with percentage utilization), and the
 * running job count.  One log record is emitted per item.
 *
 * Parameters:
 *   rsrcs - the entry to report; must not be NULL.
 */
static void
dump_resources(Resources *rsrcs)
{
    char   *id = "dump_resources";
    double  utilization;

    /* Log the system's status */

    (void)sprintf(log_buffer,
		  "Resources for host %s", rsrcs->exechost);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

    (void)sprintf(log_buffer, " :: %-24s = %s", "Memory (free):",
	schd_byte2val(rsrcs->freemem));
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

    /*
     * A host can legitimately end up with a total of zero nodes (for
     * example when the resource monitor's reply parses as 0).  Report 0%
     * in that case instead of dividing by zero and logging nan/inf.
     */
    if (rsrcs->nodes_total != 0)
	utilization = (rsrcs->nodes_alloc * 100.0) / rsrcs->nodes_total;
    else
	utilization = 0.0;

    (void)sprintf(log_buffer, " :: %-24s = %d / %d (%.2f%% utilization)", 
	"Nodes allocated:", rsrcs->nodes_alloc, rsrcs->nodes_total,
	utilization);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);

    (void)sprintf(log_buffer, " :: %-24s = %d", "Running jobs:",
	rsrcs->njobs);
    log_record(PBSEVENT_SYSTEM,PBS_EVENTCLASS_SERVER,id,log_buffer);
}
