/usr/lib/oar/oar_meta_sched is in oar-server 2.5.2-4.1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 | #!/usr/bin/perl
# $Id$
use strict;
use warnings;
use DBI();
use OAR::IO;
use OAR::Modules::Judas qw(oar_debug oar_warn oar_error set_current_log_category);
use OAR::Conf qw(init_conf dump_conf get_conf is_conf get_conf_with_default_param);
use Data::Dumper;
use OAR::Schedulers::Scheduler;
use OAR::Tools;
use OAR::Modules::Hulot;
# Select the log category used by this process.
set_current_log_category('scheduler');

# Status passed to exit() at the very end of the script; stays 0 unless a
# later step overrides it.
my $exit_code = 0;

# One regular database handle and one obtained through connect_ro().
my $base    = OAR::IO::connect();
my $base_ro = OAR::IO::connect_ro();

init_conf($ENV{OARCONFFILE});

# Tunables that fall back to a built-in value when the key is not configured.
my $security_time_overhead = is_conf("SCHEDULER_JOB_SECURITY_TIME")
    ? get_conf("SCHEDULER_JOB_SECURITY_TIME")
    : 60;
my $minimum_hole_time = is_conf("SCHEDULER_GANTT_HOLE_MINIMUM_TIME")
    ? get_conf("SCHEDULER_GANTT_HOLE_MINIMUM_TIME")
    : 0;
my $Order_part = get_conf("SCHEDULER_RESOURCE_ORDER");

# The per-queue scheduler programs are looked up below OARDIR, so the script
# refuses to run without it.
defined($ENV{OARDIR})
    or die("[MetaSched] OARDIR env variable must be defined\n");
my $binpath = $ENV{OARDIR}."/";

# Settings consumed further down: cpuset handling of resumed jobs and the
# host/port that receives the TCP notifications.
my $Cpuset_field = get_conf("JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD");
my $Remote_host  = get_conf("SERVER_HOSTNAME");
my $Remote_port  = get_conf("SERVER_PORT");

oar_debug("[MetaSched] Start of meta scheduler\n");

OAR::Schedulers::Scheduler::init_scheduler(
    $base,
    $base_ro,
    $security_time_overhead,
    $minimum_hole_time,
    $Order_part,
    get_conf("RESERVATION_WAITING_RESOURCES_TIMEOUT"),
);

# Reference time of this run; the {sec} and {sql} entries are used below.
my %initial_time = OAR::Schedulers::Scheduler::get_initial_time();
my @queues = OAR::IO::get_active_queues($base);
# For every active queue that has waiting jobs, run the scheduler program
# named by the queue policy, then process the reservations of that queue.
foreach my $queue (@queues){
    # Each entry holds the queue name first and its policy second.
    my ($name, $policy) = @{$queue}[0, 1];

    # Nothing is waiting in this queue: log it and move on.
    if (OAR::IO::is_waiting_job_specific_queue_present($base,$name) != 1){
        oar_debug("[MetaSched] No waiting job in $name queue\n");
        next;
    }

    # The policy doubles as the file name of the scheduler to execute.
    my $scheduler_path = "$binpath/schedulers/$policy";

    oar_debug("[MetaSched] Launching scheduler $name with $policy at time $initial_time{sql}\n");
    my ($sched_exit_code,$sched_signal_num,$sched_dumped_core) =
        OAR::Tools::launch_command("$scheduler_path $name $initial_time{sec} \"$initial_time{sql}\"");

    # A signal or a core dump disables the queue.
    my $abnormal_end = (($sched_signal_num != 0) || ($sched_dumped_core != 0));
    if ($abnormal_end){
        oar_error("[MetaSched] Something wrong occured, we inactive the queue $name (signal or core dumped or '$scheduler_path' cannot be executed)!!! Look at 'oarnotify' if you want to change queue settings.\n");
        OAR::IO::stop_a_queue($base,$name);
    }

    # Exit code 1 is propagated; any other non-zero value disables the queue.
    if ($sched_exit_code == 1){
        $exit_code = 1;
    }
    elsif ($sched_exit_code != 0){
        oar_error("[MetaSched] Scheduler $scheduler_path on queue $name at time $initial_time{sec} returns a bad value : $sched_exit_code. So this queue will be disabled (look at 'oarnotify' if you want to change queue settings).\n");
        OAR::IO::stop_a_queue($base,$name);
        #$exit_code = 3;
    }

    # Both reservation steps always run; a non-zero result from either one
    # sets the exit code to 1.
    $exit_code = 1
        if (OAR::Schedulers::Scheduler::treate_waiting_reservation_jobs($base,$name) != 0);
    $exit_code = 1
        if (OAR::Schedulers::Scheduler::check_reservation_jobs($base,$base_ro,$name,$Order_part) != 0);
}
# Only when nothing above changed the exit code: look for jobs to kill first,
# and for jobs to launch only if there is nothing to kill.
if ($exit_code == 0){
    my $kill_needed = (OAR::Schedulers::Scheduler::check_jobs_to_kill($base) == 1);
    if ($kill_needed){
        # We must kill besteffort jobs
        OAR::Tools::notify_tcp_socket($Remote_host,$Remote_port,"ChState");
        $exit_code = 2;
    }
    elsif (OAR::Schedulers::Scheduler::check_jobs_to_launch($base) == 1){
        $exit_code = 1;
    }
}

# Update visu gantt tables
OAR::Schedulers::Scheduler::update_gantt_visu_tables($base);

# The read-only handle is not used past this point.
OAR::IO::disconnect($base_ro);
# Manage dynamic node feature: pass the nodes reported by get_idle_nodes()
# either to Hulot (the energy saving module) or to the configured sleep
# command.

# Seconds granted to the external sleep / wake-up commands.
my $timeout_cmd = 10;
# Becomes 1 once Hulot has been called during this run.
my $flagHulot=0;
if (is_conf("SCHEDULER_TIMEOUT")){
    $timeout_cmd = get_conf("SCHEDULER_TIMEOUT");
}

# Read the switch with an explicit "no" default, exactly as the rest of this
# script does. A bare get_conf() result may be undefined when the key is not
# configured, and comparing it with "eq" warns under "use warnings".
my $halt_via_hulot = (get_conf_with_default_param("ENERGY_SAVING_INTERNAL", "no") eq "yes");

if ((is_conf("SCHEDULER_NODE_MANAGER_SLEEP_CMD")
        or ($halt_via_hulot and is_conf("ENERGY_SAVING_NODE_MANAGER_SLEEP_CMD")))
    and is_conf("SCHEDULER_NODE_MANAGER_SLEEP_TIME")
    and is_conf("SCHEDULER_NODE_MANAGER_IDLE_TIME")){
    my @node_halt = OAR::Schedulers::Scheduler::get_idle_nodes(
        $base,
        get_conf("SCHEDULER_NODE_MANAGER_IDLE_TIME"),
        get_conf("SCHEDULER_NODE_MANAGER_SLEEP_TIME")
    );
    if (@node_halt){
        oar_debug("[MetaSched] Some nodes can be halted : @node_halt\n");
        if ($halt_via_hulot){
            # Using the built-in energy saving module to shut down nodes;
            # a true return value is reported as a communication problem.
            if (OAR::Modules::Hulot::halt_nodes(\@node_halt)){
                oar_error("[MetaSched] Communication problem with Hulot (the energy saving module)!\n");
            }
            $flagHulot=1;
        }
        else{
            # Not using the built-in energy saving module to shut down nodes:
            # the node list is fed to the configured command on its stdin.
            my $cmd = get_conf("SCHEDULER_NODE_MANAGER_SLEEP_CMD");
            if (! defined(OAR::Tools::fork_and_feed_stdin($cmd, $timeout_cmd, \@node_halt))){
                oar_error("[MetaSched] Try to launch the command $cmd to stop some nodes but the command timeouted($timeout_cmd s).\n");
            }
            oar_debug("[MetaSched] @node_halt should be shutting down now.\n");
        }
    }
}
# Wake up the nodes reported by get_nodes_to_wake_up(), either through Hulot
# (the energy saving module) or through the configured wake-up command.
#
# The ENERGY_SAVING_INTERNAL switch is read with an explicit "no" default, as
# everywhere else in this script. A bare get_conf() result may be undefined
# when the key is not configured, and comparing it with "eq" warns under
# "use warnings".
if (is_conf("SCHEDULER_NODE_MANAGER_WAKE_UP_CMD")
    or ((get_conf_with_default_param("ENERGY_SAVING_INTERNAL", "no") eq "yes")
        and is_conf("ENERGY_SAVING_NODE_MANAGER_WAKE_UP_CMD"))){
    # Wake up right nodes
    my @nodes = OAR::Schedulers::Scheduler::get_nodes_to_wake_up($base);
    if (@nodes){
        oar_debug("[MetaSched] Some nodes must be started : @nodes\n");
        if (get_conf_with_default_param("ENERGY_SAVING_INTERNAL", "no") eq "yes"){
            # Using the built-in energy saving module to wake up nodes;
            # a true return value is reported as a communication problem.
            if (OAR::Modules::Hulot::wake_up_nodes(\@nodes)){
                oar_error("[MetaSched] Communication problem with Hulot (the energy saving module)!\n");
            }
            $flagHulot=1;
        }
        else{
            # Not using the built-in energy saving module to wake up nodes:
            # the node list is fed to the configured command on its stdin.
            my $cmd = get_conf("SCHEDULER_NODE_MANAGER_WAKE_UP_CMD");
            if (! defined(OAR::Tools::fork_and_feed_stdin($cmd, $timeout_cmd, \@nodes))){
                oar_error("[MetaSched] Try to launch the command $cmd to wake up some nodes but the command timeouted($timeout_cmd s).\n");
            }
        }
    }
}
# Send CHECK signal to Hulot if needed, i.e. when the internal energy saving
# module is enabled and neither the halt nor the wake-up step called it.
unless ($flagHulot){
    if (get_conf_with_default_param("ENERGY_SAVING_INTERNAL", "no") eq "yes"){
        # A true return value is reported as a communication problem.
        my $check_failed = OAR::Modules::Hulot::check();
        oar_error("[MetaSched] Communication problem with Hulot (the energy saving module)!\n")
            if ($check_failed);
    }
}
# Search jobs to resume: for every job in the "Resuming" state for which
# get_jobs_on_resuming_job_resources() reports no other job, run the optional
# admin script and then ask the job's nodes to resume it.
RESUMING_JOB:
foreach my $j (OAR::IO::get_jobs_in_state($base, "Resuming")){
    my @other_jobs = OAR::IO::get_jobs_on_resuming_job_resources($base,$j->{job_id});
    # TODO : look for timesharing other jobs. What do we do?????
    # Other jobs were reported on these resources: leave this job untouched.
    next RESUMING_JOB if (@other_jobs);

    # We can resume the job ($j is a hash reference, so log its id)
    oar_debug("[MetaSched] We can resume the job $j->{job_id}\n");

    ###############
    # RESUME PART #
    ###############
    my $script = get_conf("JUST_BEFORE_RESUME_EXEC_FILE");
    my $timeout = get_conf("SUSPEND_RESUME_SCRIPT_TIMEOUT");
    $timeout = OAR::Tools::get_default_suspend_resume_script_timeout() if (!defined($timeout));

    if (defined($script)){
        # Launch admin script, bounded by an alarm of $timeout seconds
        my $script_error = 0;
        eval {
            # "local" restores the previous ALRM handler when the eval ends
            local $SIG{ALRM} = sub { die "alarm\n" };
            alarm($timeout);
            oar_debug("[OAR_META_SCHED] [$j->{job_id}] LAUNCH the script just before the resume : $script $j->{job_id}\n");
            # NOTE(review): the literal word "script" is passed as the first
            # argument although the log lines do not show it - confirm that
            # this is intended.
            $script_error = system("$script script $j->{job_id}");
            oar_debug("[OAR_META_SCHED] [$j->{job_id}] END the script just before the resume : $script $j->{job_id}\n");
            alarm(0);
        };
        # Save the error before any later call can overwrite $@
        my $eval_error = $@;
        # Make sure no alarm stays armed if the eval was left early
        alarm(0);
        if ($eval_error || ($script_error != 0)){
            my $str = "[OAR_META_SCHED] [$j->{job_id}] Resume script error : $eval_error; return code = $script_error\n";
            oar_error($str);
            OAR::IO::add_new_event($base,"RESUME_SCRIPT_ERROR",$j->{job_id},$str);
            OAR::IO::frag_job($base,$j->{job_id});
            OAR::Tools::notify_tcp_socket($Remote_host,$Remote_port,"Qdel");
            # The script failed: do not try to resume this job
            next RESUMING_JOB;
        }
    }

    if (defined($Cpuset_field)){
        my $cpuset_name = OAR::IO::get_job_cpuset_name($base, $j->{job_id});
        my $cpuset_nodes = OAR::IO::get_cpuset_values_for_a_moldable_job($base,$Cpuset_field,$j->{assigned_moldable_job});
        # Data passed along to manage_remote_commands() below
        my $suspend_data_hash = {
            name => $cpuset_name,
            job_id => $j->{job_id},
            oarexec_pid_file => OAR::Tools::get_oar_pid_file_name($j->{job_id}),
        };
        if (defined($cpuset_nodes)){
            my $taktuk_cmd = get_conf("TAKTUK_CMD");
            my $openssh_cmd = get_conf("OPENSSH_CMD");
            $openssh_cmd = OAR::Tools::get_default_openssh_cmd() if (!defined($openssh_cmd));
            if (is_conf("OAR_SSH_CONNECTION_TIMEOUT")){
                OAR::Tools::set_ssh_timeout(get_conf("OAR_SSH_CONNECTION_TIMEOUT"));
            }
            my $suspend_file = get_conf("SUSPEND_RESUME_FILE");
            $suspend_file = OAR::Tools::get_default_suspend_resume_file() if (!defined($suspend_file));
            # A relative file name is taken relative to OARDIR
            $suspend_file = "$ENV{OARDIR}/$suspend_file" if ($suspend_file !~ /^\//);
            # The keys of $cpuset_nodes are the targets of the "resume" action
            my ($tag,@bad) = OAR::Tools::manage_remote_commands([keys(%{$cpuset_nodes})],$suspend_data_hash,$suspend_file,"resume",$openssh_cmd,$taktuk_cmd,$base);
            if ($tag == 0){
                my $str = "[OAR_META_SCHED] [SUSPEND_RESUME] [$j->{job_id}] Bad suspend/resume file : $suspend_file\n";
                oar_error($str);
                OAR::IO::add_new_event($base, "SUSPEND_RESUME_MANAGER_FILE", $j->{job_id}, $str);
            }
            elsif (!@bad){
                # No node reported as bad: record the resume
                OAR::IO::resume_job_action($base,$j->{job_id});
            }
            else{
                my $str = "[OAR_META_SCHED] [SUSPEND_RESUME] [$j->{job_id}] Error on several nodes : @bad\n";
                oar_error($str);
                OAR::IO::add_new_event_with_host($base,"RESUME_ERROR",$j->{job_id},$str,\@bad);
                OAR::IO::frag_job($base,$j->{job_id});
                # A Leon must be run
                $exit_code = 2;
            }
        }
    }
    #####################
    # RESUME PART, END #
    #####################
}
# Notify oarsub -I when they will be launched: each entry's info_type holds
# the "address:port" to contact.
for my $job (OAR::IO::get_gantt_waiting_interactive_prediction_date($base)){
    my ($client_addr, $client_port) = split(/:/, $job->{info_type});
    my $predicted_start = OAR::IO::local_to_sql($job->{start_time});

    oar_debug("[MetaSched] [$job->{job_id}] Notifying user of the start prediction: $predicted_start ($job->{message})\n");
    OAR::Tools::notify_tcp_socket(
        $client_addr,
        $client_port,
        "[$initial_time{sql}] Start prediction: $predicted_start ($job->{message})"
    );
}

# Close the remaining database handle and exit with the status collected
# during this run.
OAR::IO::disconnect($base);
oar_debug("[MetaSched] End of meta scheduler\n");
exit($exit_code);
|