This file is indexed.

/usr/lib/oar/sarko is in oar-server 2.5.7-3.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/usr/bin/perl
# $Id$
#Almighty module : check walltimes and jobs to frag
use strict;
use DBI();
use Data::Dumper;
use OAR::IO;
use OAR::Modules::Judas qw(oar_debug oar_warn oar_error set_current_log_category);
use OAR::Conf qw(init_conf dump_conf get_conf is_conf);
use OAR::Tools;

# Log category
set_current_log_category('main');

# Get job delete and checkpoint walltime values
my $Leon_soft_walltime = OAR::Tools::get_default_leon_soft_walltime();
my $Leon_walltime = OAR::Tools::get_default_leon_walltime();
init_conf($ENV{OARCONFFILE});
if (is_conf("JOBDEL_SOFTWALLTIME")){
    $Leon_soft_walltime = get_conf("JOBDEL_SOFTWALLTIME");
}
if (is_conf("JOBDEL_WALLTIME")){
    $Leon_walltime = get_conf("JOBDEL_WALLTIME");
}

if ($Leon_walltime <= $Leon_soft_walltime){
    $Leon_walltime = $Leon_soft_walltime + 1;
    oar_warn("[sarko] (JOBDEL_WALLTIME <= JOBDEL_SOFTWALLTIME) so I must set JOBDEL_WALLTIME to $Leon_walltime\n");
}

my $Server_hostname = get_conf("SERVER_HOSTNAME");

my $Deploy_hostname = get_conf("DEPLOY_HOSTNAME");
if (!defined($Deploy_hostname)){
    $Deploy_hostname = $Server_hostname;
}

my $Cosystem_hostname = get_conf("COSYSTEM_HOSTNAME");
if (!defined($Cosystem_hostname)){
    $Cosystem_hostname = $Server_hostname;
}

my $Openssh_cmd = get_conf("OPENSSH_CMD");
$Openssh_cmd = OAR::Tools::get_default_openssh_cmd() if (!defined($Openssh_cmd));

if (is_conf("OAR_SSH_CONNECTION_TIMEOUT")){
    OAR::Tools::set_ssh_timeout(get_conf("OAR_SSH_CONNECTION_TIMEOUT"));
}

if (is_conf("OAR_RUNTIME_DIRECTORY")){
    OAR::Tools::set_default_oarexec_directory(get_conf("OAR_RUNTIME_DIRECTORY"));
}

oar_debug("[sarko] JOBDEL_SOFTWALLTIME = $Leon_soft_walltime; JOBDEL_WALLTIME = $Leon_walltime\n");

# get script args
my $base = OAR::IO::connect();
if (!defined($base)){
    oar_error("[sarko] Can not connect to the database\n");
    exit(1);
}

oar_debug("[sarko] Hello, identity control !!!\n");

my $guilty_found=0;
my $current = OAR::IO::get_date($base);

# Look at leon timers
# Decide if OAR must retry to delete the job or just change values in the database
foreach my $j (OAR::IO::get_timered_job($base)){
    my $job_ref = OAR::IO::get_job($base,$j->{job_id});
    if (($job_ref->{state} eq "Terminated") || ($job_ref->{state} eq "Error") || ($job_ref->{state} eq "Finishing")){
        OAR::IO::job_fragged($base,$j->{job_id});
        oar_debug("[sarko] I set to FRAGGED the job $j->{job_id}\n");
    }else{
        my $frag_date = OAR::IO::get_frag_date($base,$j->{job_id});
        oar_debug("[sarko] frag date : $frag_date , $frag_date\n");
        if (($current > $frag_date+$Leon_soft_walltime) && ($current <= $frag_date+$Leon_walltime)){
            oar_debug("[sarko] Leon will RE-FRAG bipbip of job $j->{job_id}\n");
            OAR::IO::job_refrag($base,$j->{job_id});
            $guilty_found=1;
        }elsif ($current > $frag_date+$Leon_walltime){
            oar_debug("[sarko] Leon will EXTERMINATE bipbip of job $j->{job_id}\n");
            OAR::IO::job_leon_exterminate($base,$j->{job_id});
            $guilty_found=1;
        }else{
            oar_debug("[sarko] The leon timer is not yet expired for the job $j->{job_id}; I do nothing\n");
        }
    }
}


# Look at job walltimes
foreach my $job (OAR::IO::get_jobs_in_state($base, "Running")){
    my ($start, $max);

    # Get starting time
    $start = $job->{start_time};

    # Get maxtime
    my $mold_job = OAR::IO::get_current_moldable_job($base,$job->{assigned_moldable_job});
    $max = $mold_job->{moldable_walltime};
    if ($job->{suspended} eq "YES"){
        # This job was suspended so we must recalculate the walltime
        $max += OAR::IO::get_job_suspended_sum_duration($base,$job->{job_id},$current);
    }

    oar_debug("[sarko] Job [$job->{job_id}] from $start with $max; current time=$current\n");
    if ($current > $start+$max){
        oar_debug("--> (Elapsed)\n");
        $guilty_found=1;
        OAR::IO::lock_table($base,["frag_jobs","event_logs","jobs"]);
        OAR::IO::frag_job($base, $job->{job_id});
        OAR::IO::unlock_table($base);
        OAR::IO::add_new_event($base,"WALLTIME",$job->{job_id},"[sarko] Job [$job->{job_id}] from $start with $max; current time=$current (Elapsed)");
    }elsif (($job->{checkpoint} > 0) && ($current >= ($start+$max-$job->{checkpoint}))){
        # OAR must notify the job to checkpoint itself
        oar_debug("[sarko] Send checkpoint signal to the job $job->{job_id}\n");
        # Retrieve node names used by the job
        my @hosts = OAR::IO::get_job_current_hostnames($base,$job->{job_id});
        my $types = OAR::IO::get_job_types_hash($base,$job->{job_id});
        my $host_to_connect = $hosts[0];
        if ((defined($types->{cosystem})) or ($#hosts < 0)){
            $host_to_connect = $Cosystem_hostname;
        }elsif (defined($types->{deploy})){
            $host_to_connect = $Deploy_hostname;
        }
        OAR::IO::add_new_event($base,"CHECKPOINT",$job->{job_id},"User oar (sarko) requested a checkpoint on the job $job->{job_id} on $host_to_connect");
        my $str_comment;
        my @exit_codes;
        # Timeout the ssh command
        eval {
            $SIG{ALRM} = sub { die "alarm\n" };
            alarm(OAR::Tools::get_ssh_timeout());
            @exit_codes = OAR::Tools::signal_oarexec($host_to_connect,$job->{job_id},"SIGUSR2",1,$base, $Openssh_cmd, '');
            alarm(0);
        };
        if ($@){
            if ($@ eq "alarm\n"){
                $str_comment = "[sarko] Cannot contact $hosts[0], operation timouted (".OAR::Tools::get_ssh_timeout()." s). So I cannot send checkpoint signal to the job $job->{job_id} on $host_to_connect";
                oar_warn("$str_comment\n");
                OAR::IO::add_new_event($base,"CHECKPOINT_ERROR",$job->{job_id},$str_comment);
            }else{
                $str_comment = "[sarko] An unknown error occured during the sending of the checkpoint signal to the job $job->{job_id} on the host $host_to_connect";
                oar_warn("$str_comment\n");
                OAR::IO::add_new_event($base,"CHECKPOINT_ERROR",$job->{job_id},$str_comment);
            }
        }else{
            if ($exit_codes[0] == 0){
                $str_comment = "[sarko] The job $job->{job_id} was notified to checkpoint itself on the node $host_to_connect";
                oar_debug("$str_comment\n");
                OAR::IO::add_new_event($base,"CHECKPOINT_SUCCESSFULL",$job->{job_id},$str_comment);
            }else{
                $str_comment = "[sarko] The kill command return a bad exit code (@exit_codes) for the job $job->{job_id} on the node $host_to_connect";
                oar_warn("$str_comment\n");
                OAR::IO::add_new_event($base,"CHECKPOINT_ERROR",$job->{job_id},$str_comment);
            }
        }
    }
}



# Retrieve nodes with expiry_dates in the past
# special for Desktop computing
my @resources = OAR::IO::get_expired_resources($base);
if ($#resources >= 0) {
    # First mark the nodes as dead
    foreach my $r (@resources) {
        OAR::IO::set_resource_nextState($base, $r, 'Suspected');
        my $rinfo = OAR::IO::get_resource_info($base, $r);
        OAR::IO::add_new_event_with_host($base, "LOG_SUSPECTED", 0, "The DESKTOP COMPUTING resource $r has expired on node $rinfo->{network_address}", [$rinfo->{network_address}]);
    }
    # Then notify Almighty
    my $remote_host = get_conf("SERVER_HOSTNAME");
    my $remote_port = get_conf("SERVER_PORT");

    OAR::Tools::notify_tcp_socket($remote_host,$remote_port,"ChState");
}

my $dead_switch_time = OAR::Tools::get_default_dead_switch_time();
if (is_conf("DEAD_SWITCH_TIME")){
    $dead_switch_time = get_conf("DEAD_SWITCH_TIME");
}
# Get Absent and Suspected nodes for more than 5 mn (default)
if ($dead_switch_time > 0){
    my $notify = 0;
    foreach my $r (OAR::IO::get_absent_suspected_resources_for_a_timeout($base,$dead_switch_time)){
        OAR::IO::set_resource_nextState($base,$r,"Dead");
        OAR::IO::update_resource_nextFinaudDecision($base,$r,"YES");
        oar_debug("[Sarko] Set the next state of $r to Dead\n");
        $notify = 1;
    }
    if ($notify > 0){
        my $remote_host = get_conf("SERVER_HOSTNAME");
        my $remote_port = get_conf("SERVER_PORT");
        OAR::Tools::notify_tcp_socket($remote_host,$remote_port,"ChState");
    }
}
                                                                             

OAR::IO::disconnect($base);

exit($guilty_found);