##############################################################################
#
#  As of Mon Jul  1 18:59:35 CDT 1996, now places more importance on walltime
#  than cput.
#
#  02/14/96 - modify sorting to sort by walltime/job-id
#
#  Jobs are sorted by walltime and placed into the following categories:
#     SMALL  0-4 hours walltime
#     MED    4-8 hours walltime
#     LARGE  8-24 hours walltime
#     HUGE   24+ hours walltime
#
#  Two jobs from each category are allowed to run at any instance of time
#  subject to a maximum number of jobs.  If a category is full and there
#  is a job waiting to run in that category, the job may "float up" to 
#  a larger category (ie: if there are 2 small jobs running, a waiting small
#  job may be run as a medium, large, or huge job.
#
#  If a job wants lots of memory (max_ram), the job may only run
#  at certain times.
#
#  XXX check error returns
#  XXX datetime blows up when >99 hrs are specified
#
#	procedures:
#		compare  --- to be used with lsort
#		tobytes  --- converts to bytes
#
#	Special files:
#		debug.dat   --- info for debuging the program
#
#
##############################################################################
#
# Debug file handle

#...Read the value of the debugging mode from the 'sched_mode' file
#......Debugging mode = 1 		Silent    mode = 0
#
set debug 1

set MAX_RUNNING_JOBS 8

set SMALL_JOB 0
set MED_JOB   1
set LARGE_JOB 2 
set HUGE_JOB  3


set MAX_SMALL_JOBS 2
set MAX_MED_JOBS   2
set MAX_LARGE_JOBS 2
set MAX_HUGE_JOBS  2

set SMALL_JOB_WALLTIME  [datetime "04:00:00"]
set MED_JOB_WALLTIME    [datetime "08:00:00"]
set LARGE_JOB_WALLTIME  [datetime "24:00:00"]

set now [datetime]
set ymd   [strftime "%Y%m%d" $now]
if { $debug == 1} {
   set db [open /usr/spool/pbs/sched_priv/debug/$ymd a]
}

#
#
# Keep 32MB of virtual memory free at all times
set memory_overhead	[ expr 32*1048576 ] 
# Maximum RAM usage by any one job during: 800MB
set max_ram		[ expr 800*1048576 ]	
# Above this load average, we will defer all jobs
set max_load_ave		20
#
#####################################################################
#
#	Procedure compare: to be used with lsort for a list {count name}
#                           sorts with list element 0
#                           if element 0 equal, sort by element 1
#
#####################################################################
#
proc compare {a b} {
   set first [lindex $a 0]
   set two   [lindex $b 0]

   puts "in compare"
   if { $first > $two } {
 	return 1
   } else {
      if { $first < $two } {
         return -1
      } else {
         set first [lindex $a 1]
         set two   [lindex $b 1]
         if { $first > $two } {
            return 1
         } else {
            if { $first < $two } {
               return -1
            }
            else {
               return 0
            }
         }
      }
   }
   puts "out compare" 
}

#####################################################################
#
#     convert to bytes 	
#
#####################################################################
proc tobytes {size} {
# XXX this proc needs support for words, not only bytes
	set csize [ string trim $size b ]
	set unit "b"
	regexp {[a-zA-Z]+$} $csize unit
	set unit [string index [ string tolower $unit ] 0 ]
	set number 0
	regexp {^[0-9]+} $csize number

	if {$unit == "m"} {
		set number [ expr $number*1048576 ]
	} elseif {$unit == "k"} {
		set number [ expr $number*1024 ]
	} elseif {$unit == "g"} {
		set number [ expr $number*1073741824 ]
	} 

	if {$number < 0} {
		set number 2147483647
	}

	return $number
	
}
	

######################################################################
#
# **** INITIALIZATION PHASE
#
# Initialized data array for usage statisctics
#
#
######################################################################
#
if { $debug == 1 } {
   puts "== INITIALIZATION PHASE"
}

puts $db "== INITIALIZATION PHASE"

#
# get rid of previous values so we don't use up memory
if {[info exists user]} {unset user}
if {[info exists curr_running]} {unset curr_running}
if {[info exists numRunning]} {unset numRunning}
if {[info exists numRunningJobs]} {unset numRunningJobs}
if {[info exists WaitingJobs]} {unset WaitingJobs}
#
#
#...Open connection to the resource monitor
#
set host_con [openrm shiva.missouri.edu]
if {$host_con < 0} {
	puts $db "bad return from openrm"
	return
}

#########################################################################
#
# ***** MONITOR PHASE
#
#
# Loop through all currently running jobs 
# We determine who is exceeding their resource limits this way.
#
#########################################################################
puts $db "== MONITOR PHASE"
if { $debug == 1 } {
   puts "== MONITOR PHASE"
}

# initialize number of jobs and job sizes in each queue
set queues [pbsstatque]
foreach qq $queues {
   set qname [lindex $qq 0]
   set numRunning($qname,$SMALL_JOB) 0
   set numRunning($qname,$MED_JOB)   0
   set numRunning($qname,$LARGE_JOB) 0
   set numRunning($qname,$HUGE_JOB)  0

   set numRunningJobs($qname) 0
}


set runningjobs [pbsstatjob]

# puts $db $runningjobs

foreach job $runningjobs {
	set jid [lindex $job 0]
	set attrls [lindex $job 1]

        # Only currently running jobs, now
	set cnt [lsearch $attrls {job_state *}]
	set attrl [ lindex $attrls $cnt]
	set job_state [lindex $attrl 1]
	puts $db "job state for $jid is $job_state"
	if {$job_state != "R"} { 
		continue
	}

        # Get job owner    
	set cnt [lsearch $attrls {euser *}]
	set attrl [ lindex $attrls $cnt]
	set job_owner [lindex $attrl 1]

        # Keep track of number of jobs by owner
	if {[info exists curr_running($job_owner)]} { 
		incr curr_running($job_owner)
		puts $db "$job_owner: curr running incremented to $curr_running($job_owner)"
	} else {
		set curr_running($job_owner) 1
		puts $db "$job_owner: curr running set to one"
	}

        #..Get the amount of wallclock time that the job kicked off with
	set cnt [lsearch $attrls {Resource_List.walltime *}]
	set attrl [ lindex $attrls $cnt]
	set walltime_orig [datetime [lindex $attrl 1]]

        #..Get the queue the job is running in
        set cnt [lsearch $attrls {queue *}]
        set attrl [ lindex $attrls $cnt ]
        set qname [ lindex $attrl 1 ]
   
        #..increment type of job running by its walltime
        if { $walltime_orig <= $SMALL_JOB_WALLTIME } {
           incr numRunning($qname,$SMALL_JOB)
        } elseif { $walltime_orig <= $MED_JOB_WALLTIME } {
           incr numRunning($qname,$MED_JOB)
        } elseif { $walltime_orig <= $LARGE_JOB_WALLTIME } {
           incr numRunning($qname,$LARGE_JOB)
        } else {
           incr numRunning($qname,$HUGE_JOB)
        }
        incr numRunningJobs($qname)


        #..Get the amount of memory that the job kicked off requesting 
	set cnt [lsearch $attrls {Resource_List.mem *}]
	set attrl [ lindex $attrls $cnt]
	set req_mem($jid) [ tobytes [lindex $attrl 1]]

}

set queues [pbsstatque]
foreach qq $queues {
   set qname [lindex $qq 0]
   #..take into account small job using medium slot and so forth
   if {$numRunning($qname,$SMALL_JOB) > $MAX_SMALL_JOBS} {
      set i [expr $numRunning($qname,$SMALL_JOB) - $MAX_SMALL_JOBS]
      set numRunning($qname,$SMALL_JOB) $MAX_SMALL_JOBS
      incr numRunning($qname,$MED_JOB) $i
   }

   if {$numRunning($qname,$MED_JOB) > $MAX_MED_JOBS} {
      set i [expr $numRunning($qname,$MED_JOB) - $MAX_MED_JOBS]
      set numRunning($qname,$MED_JOB) $MAX_MED_JOBS
      incr numRunning($qname,$LARGE_JOB) $i
   }

   if {$numRunning($qname,$LARGE_JOB) > $MAX_LARGE_JOBS} {
      set i [expr $numRunning($qname,$LARGE_JOB) - $MAX_LARGE_JOBS]
      set numRunning($qname,$LARGE_JOB) $MAX_LARGE_JOBS
      incr numRunning($qname,$HUGE_JOB) $i
   }

   if { $debug } {
      puts $db "in queue $qname the jobs are:"
      puts $db "Number of Running Small Jobs: $numRunning($qname,$SMALL_JOB)"
      puts $db "Number of Running Med   Jobs: $numRunning($qname,$MED_JOB)"
      puts $db "Number of Running Large Jobs: $numRunning($qname,$LARGE_JOB)"
      puts $db "Number of Running Huge  Jobs: $numRunning($qname,$HUGE_JOB)"
   }	
}

#########################################################################
#
# KICK-1 PHASE
#
# Select all the runnable jobs
# Loop through all the jobs to find out how many jobs each user have
#
# Variables:
#  jobs    --- list of all runnable jobs
#
#########################################################################
#
puts $db "== KICK-1 PHASE"
if { $debug == 1 } {
   puts "== KICK-1 PHASE"
}

set jobs [pbsselstat]

set WaitingJobs ""

foreach job $jobs {
        #..Get the job id number and the attributes for the job
	set jid [lindex $job 0]
	set attrls [lindex $job 1]

        #..Get the user of this job
	set cnt [lsearch $attrls {euser *}]
	set attrl [ lindex $attrls $cnt]
	set user($jid) [lindex $attrl 1]

        #..Get the queue of this job
	set cnt [lsearch $attrls {queue *}]
	set attrl [ lindex $attrls $cnt]
	set queue($jid) [lindex $attrl 1]

        #..Get the requested cput
 	set cnt [lsearch $attrls {Resource_List.cput *}]
	set attrl [ lindex $attrls $cnt]
	set temp [lindex $attrl 1]

        #..if requested cpu time empty
	if { $temp == "" } {
           set req_cput($jid) 01:00:00
	   puts $db "*** WE SHOULD NEVER BE HERE (req_cput was empty!)"
        } else {
           set req_cput($jid) $temp
	}

        #..Get the requested pcput, non-array
        set cnt [lsearch $attrls {Resource_List.pcput *}]
        set attrl [ lindex $attrls $cnt]
        set pcput [lindex $attrl 1]

        #..If pcput is unacceptable then make it acceptable
	if { $pcput > $req_cput($jid) || $pcput == "" } {
	   pbsalterjob $jid "{ Resource_List pcput $req_cput($jid) }"
	   if { $debug } {
	      puts $db "job pcput forced to $req_cput($jid) (was $pcput )"
	   }
	}

        #..Get the requested pmem, non-array
        set cnt [lsearch $attrls {Resource_List.pmem *}]
        set attrl [ lindex $attrls $cnt]
        set pmem [tobytes [lindex $attrl 1] ]

        #..place the requested memory in a temporary variable
 	set cnt [lsearch $attrls {Resource_List.mem *}]
	set attrl [ lindex $attrls $cnt]
        set temp [lindex $attrl 1]

        #..if memory not specified
	if { $temp == "" } {
           # alter job and set memory to 64meg
           set req_mem($jid) [ tobytes 64m ]
	   set pmem ""
	   pbsalterjob $jid "{ Resource_List mem 64m }"
	   if { $debug } {
	      puts $db "no default memory footprint specified; 64MB used"
           }	
        } else {
           # convert memory request to bytes
           set req_mem($jid) [ tobytes $temp ]
	}
    
        # If pmem is unacceptable then make it acceptable
        if { $pmem > $req_mem($jid) || $pmem == "" } {
           pbsalterjob $jid "{ Resource_List pmem $req_mem($jid) }"
           if { $debug } {      
              puts $db "job pmem forced to $req_mem($jid) (was $pmem)"
           }
        }   
	
        #..Get the requested walltime
        set cnt [lsearch $attrls {Resource_List.walltime *}]
        set attrl [ lindex $attrls $cnt]
        set req_walltime($jid) [lindex $attrl 1]

        if { $req_walltime($jid) == "" } {
           set req_walltime($jid) "01:00:00"
           puts $db "requested walltime forced to 01:00:00 from nothing"
        }

        #..place job id, walltime, and memory in list of Waiting Jobs
        set temp [list [datetime $req_walltime($jid)] $jid]
        if { $WaitingJobs == "" } {
           set WaitingJobs [list $temp]
        } else {
           lappend WaitingJobs $temp
        }

        # XXX Get the size of the largest single file that can be created.
        set cnt [lsearch $attrls {Resource_List.file *}]
        set attrl [ lindex $attrls $cnt]
        set req_file($jid) [tobytes [lindex $attrl 1] ]

	puts $db "Requested file size = $req_file($jid)"

	if { $debug } { 
           puts $db "--- [strftime "%D" $now]  $hour ---" 
	   puts $db "    job: $jid     user:$user($jid)    queue:$queue($jid)"
	   puts $db "Requested Resources: cput=$req_cput($jid) mem=$req_mem($jid) wt=$req_walltime($jid) " 
	}

}

##########################################################################

#
# KICK-2 PHASE
#
# sort waiting jobs by walltime
#
##########################################################################
#
puts $db "== KICK-2 PHASE"
if { $debug == 1 } {
   puts "== KICK-2 PHASE"
}

if {$debug != 0} {
      puts $db "Waiting jobs before sorting are:"
      foreach job $WaitingJobs {
         set jobnum [lindex $job 1]
         set jobtime [lindex $job 0]
         puts $db "  $jobnum $jobtime "
      }
}

# sort the jobs waiting in the queue by walltime
set WaitingJobs [lsort -command compare $WaitingJobs]

if {$debug != 0} {
      puts $db "Waiting jobs after sorting are:"
      foreach job $WaitingJobs {
         set jobnum [lindex $job 1]
         set jobtime [lindex $job 0]
         puts $db "  $jobnum $jobtime "
      }
}

#
#..Open the Status file which will contain the PBS status of all the queued jobs
#
if { $debug = 1 } {
   set f [open /usr/spool/pbs/sched_priv/pbs.status w ]
}

############################################################################ 
# GONOGO PHASE
#
############################################################################ 
#
puts $db "== GONOGO PHASE"
if { $debug == 1 } {
   puts "== GONOGO PHASE"
}
 
set run_job 1
set defer_job 2
set hold_job 3

if {$WaitingJobs == ""} {
   puts "no jobs waiting in the queue" 
}
foreach job $WaitingJobs {
        set jid [lindex $job 1]
     
        if { $debug != 0 } {
           puts $db "checking to perhaps run $jid"
        }
	# default action is to run the job
	set jobaction $defer_job
	set jobreason "No room in queue $queue($jid) for job with walltime: $req_walltime($jid)"

        #..convert wall time to seconds
        set SecWallTime [datetime $req_walltime($jid)]
 
        # if this is a small job and space for small jobs
        if { $SecWallTime <= $SMALL_JOB_WALLTIME \
          && $numRunning($queue($jid),$SMALL_JOB) < $MAX_SMALL_JOBS } {
           set jobsize $SMALL_JOB
           set jobaction $run_job
        } elseif { $SecWallTime <= $MED_JOB_WALLTIME \
          && $numRunning($queue($jid),$MED_JOB) < $MAX_MED_JOBS } {
           set jobsize $MED_JOB
           set jobaction $run_job
        } elseif { $SecWallTime <= $LARGE_JOB_WALLTIME\
          && $numRunning($queue($jid),$LARGE_JOB) < $MAX_LARGE_JOBS } {
           set jobsize $LARGE_JOB
           set jobaction $run_job
        } elseif { $numRunning($queue($jid),$HUGE_JOB) < $MAX_HUGE_JOBS } {
           set jobsize $HUGE_JOB
           set jobaction $run_job
        }

	# get queue statistics
	set queues [pbsstatque]

	# make sure that the queue has been started before running the job
	foreach qq $queues {
		set qname [lindex $qq 0]
		set attrls [lindex $qq 1]

		if { $qname == $queue($jid) } {
	           set cnt [lsearch $attrls {started *}]
		   set attrl [ lindex $attrls $cnt]
		   set started [lindex $attrl 1]
		   # Is the queue started?
		   if { $started != "True" } {
		      set jobaction $defer_job
		      set jobreason "queue $qname not started"
	           }
                   set cnt [lsearch $attrls {max_user_run *}]
                   set attrl [ lindex $attrls $cnt]
                   set max_user_run [lindex $attrl 1]
	           puts $db "-0-: queue max_run limit is $max_user_run"
		   # Is the user running too many jobs at once?
		   if {[info exists curr_running($user($jid))]} {
		      puts $db "-1-: curr_running for this user exists and is $curr_running($user($jid))"
   		      if { $curr_running($user($jid)) >= $max_user_run } {
		         set jobaction $defer_job
			 set jobreason "user $user($jid) already has $curr_running($user($jid)) jobs running (queue max is $max_user_run)"
		      }
		   }
		}
	}

	# get machine load statistics
	   addreq $host_con "loadave"
	   addreq $host_con "availmem"
	   set load_average [getreq $host_con]
	   set free_memory [ getreq $host_con ]
     
	if { $free_memory < 0 } { 
		set free_memory 2147483647
	}

	puts $db "loadave = $load_average free_memory = $free_memory" 

	# if the job wants more memory than we have, well ....
	if { [ expr $req_mem($jid)+$memory_overhead ] > $free_memory } {
		set jobaction $defer_job
		set jobreason "$jid wants $req_mem($jid), only $free_memory RAM bytes free"
	}

	# XXX if our load average is ungodly, defer some jobs
	if { $load_average >= $max_load_ave } {
		set jobaction $defer_job
		set jobreason "$jid: load average is too high ($load_average >= $max_load_ave)"
	}

        # if job wants more than 800mb memory
        if { $req_mem($jid) > $max_ram } {
           set jobaction $defer_job
           set jobreason "job $jid wants too much memory $req_mem($jid) "
        }

	#
	# Commence scheduling
	#

        #..Here's where we rock n' roll
	if { $jobaction == $defer_job } {
		puts $db "*** Deferring job $jid"
		puts $db "*** reason: $jobreason"
	} elseif { $jobaction == $run_job } {
           if { $numRunningJobs($queue($jid)) >= $MAX_RUNNING_JOBS } {
              puts $db "*** Deferring job $jid"
              puts $db "*** reason: queue $queue($jid) is full";
           } else {
              if { $debug } {
	         puts $db "*** Running job $jid"
              }	
	         pbsrunjob $jid
              incr numRunning($queue($jid),$jobsize)
              incr numRunningJobs($queue($jid))

              if {[info exists curr_running($user($jid))]} {
                 incr curr_running($user($jid))
                 puts $db "$user($jid): curr running incremented to $curr_running($user($jid))"
              } else {
                 set curr_running($user($jid)) 1
                 puts $db "$user($jid): curr running set to one"
              }

           }
	}
}
#
if { $debug } {
   set queues [pbsstatque]
   foreach qq $queues {
      set qname [lindex $qq 0]
      puts $db "After queue submissions in queue $qname:"
      puts $db "total number of jobs is $numRunningJobs($qname)"
      puts $db "Number of Running Small Jobs: $numRunning($qname,$SMALL_JOB)"
      puts $db "Number of Running Med   Jobs: $numRunning($qname,$MED_JOB)"
      puts $db "Number of Running Large Jobs: $numRunning($qname,$LARGE_JOB)"
      puts $db "Number of Running Huge  Jobs: $numRunning($qname,$HUGE_JOB)"
   }
}	
# Close the PBS status log and debuging files
puts $db "=================================================================="
close $db
close $f
    
# Close connection to the Resource Monitor
closerm $host_con
