/usr/share/arc/scan-pbs-job is in nordugrid-arc-arex 1.1.1-1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
#!/bin/bash
#
# Periodically read the PBS server log files and write mark files
# for jobs that have finished.
# If the log files are not available, ask PBS which jobs are still known,
# and write mark files for jobs that have finished (are absent from PBS).
#
# usage: scan_pbs_job control_dir ...
# ARC1 passes the config file first, as "--config <file>"; remember it and
# drop both words so that $1 is the first control directory.
if [ "$1" = "--config" ]; then shift; ARC_CONFIG=$1; shift; fi
# At least one control directory is required.
if [ -z "$1" ] ; then exit 1 ; fi
# Absolute path of the directory holding this script. Everything is quoted
# so that an installation path containing whitespace still resolves.
basedir=$(dirname -- "$0")
basedir=$(cd -- "$basedir" > /dev/null && pwd) || exit $?
libexecdir="${ARC_LOCATION:-/usr}/lib/arc/"
pkgdatadir="$basedir"
# PBS environment settings live next to this script.
. "${pkgdatadir}/configure-pbs-env.sh" || exit $?
# gm-kick is taken from libexecdir; scan_common.sh is expected next to
# this script.
GMKICK=${libexecdir}/gm-kick
. "${pkgdatadir}/scan_common.sh" || exit $?
# Scratch space for temporary files; honour a caller-provided TMPDIR.
TMPDIR=${TMPDIR:-/tmp}
# Location of the PBS server logs, overridable through the configuration.
pbs_log_dir=${CONFIG_pbs_log_path:-/var/spool/pbs/server_logs}
# RUNTIME_NODE_SEES_FRONTEND mirrors CONFIG_shared_filesystem with two
# special cases: unset/empty defaults to "yes" (NFS), and an explicit "no"
# is stored as the empty string so it can be tested with -z.
case "${CONFIG_shared_filesystem}" in
  '') RUNTIME_NODE_SEES_FRONTEND=yes ;;
  no) RUNTIME_NODE_SEES_FRONTEND= ;;
  *)  RUNTIME_NODE_SEES_FRONTEND=$CONFIG_shared_filesystem ;;
esac
# The first control directory is also where this script keeps its state file.
control_dir=$1
# Collect every control directory into one string that is later re-split
# with: eval "set -- $control_dirs".
# printf %q shell-quotes each name, so whitespace, quotes, "$", backticks
# and backslashes in a directory name survive the eval unchanged instead of
# being interpreted by it.
control_dirs=
while [ $# -gt 0 ] ; do
  control_dirs="${control_dirs} $(printf '%q' "$1")"
  shift
done
my_id=$(id -u)
# One state file per user name, holding "<log file name> <lines processed>".
state_file=$control_dir/pbs_log_scan.$(id -un)
lines=$(cat "$state_file" 2>/dev/null)
# First word: name (date) of the log file last scanned.
ldt=$(echo $lines | awk '{split($0,field," ");print field[1]}')
# Second word: number of matching records already handled in that file.
lines=$(echo $lines | awk '{split($0,field," ");print field[2]}')
# Arithmetic expansion turns a missing value into 0.
lines_skip=$(( $lines + 0 ))
ldate=$(( $ldt + 0 ))
# Fallback for when the arithmetic above failed on unparsable content.
if [ -z "$lines_skip" ] ; then lines_skip='0' ; fi
if [ -z "$ldate" ] ; then ldate='0' ; fi
# find_by_local prints the path of the first job.ID.local file that contains
# the string "localid=$job_id".
# Only jobs that have a processing/job.ID.status file in one of the control
# directories are considered; each status path is rewritten to the matching
# job.ID.local path before grepping.
# Globals: control_dirs (read), job_id (read)
# Outputs: one path on stdout, or nothing when no file matches
find_by_local() {
  eval "set -- $control_dirs"
  for ctr_dir in "$@"; do
    # The directory is quoted so that names with whitespace reach find intact.
    find "${ctr_dir}/processing" -name 'job.*.status' -print0 \
      | sed 's/processing\/job\.\([^\.]*\)\.status/job.\1.local/g' \
      | xargs -0 grep -F -l "localid=$job_id" 2>/dev/null
  done \
    | head -n 1
}
# find_by_grami looks for the first job.ID.grami file that contains the string
# "joboption_jobid=$job_id" and prints the path of the corresponding
# job.ID.local file (the .grami suffix is rewritten to .local).
# Only jobs that have a processing/job.ID.status file in one of the control
# directories are considered.
# Globals: control_dirs (read), job_id (read)
# Outputs: one path on stdout, or nothing when no file matches
find_by_grami() {
  eval "set -- $control_dirs"
  for ctr_dir in "$@"; do
    # The directory is quoted so that names with whitespace reach find intact.
    find "${ctr_dir}/processing" -name 'job.*.status' -print0 \
      | sed 's/processing\/job\.\([^\.]*\)\.status/job.\1.grami/g' \
      | xargs -0 grep -F -l "joboption_jobid=$job_id" 2>/dev/null
  done \
    | sed 's/\.grami$/.local/' \
    | head -n 1
}
# set_job_vars receives the already-split fields of one PBS log line as its
# positional arguments and stores the first seven of them in the globals
# pbs_date, pbs_code, pbs_server, pbs_job, job_id, job_message and rest_line.
# Missing arguments leave the corresponding variable empty.
set_job_vars() {
  pbs_date=$1 pbs_code=$2 pbs_server=$3 pbs_job=$4
  job_id=$5 job_message=$6 rest_line=$7
}
#
# Main function for processing one PBS log.
# Extracts log lines with code 0010 (job exited) and 0008 (job killed)
#
# TODO this should be split into smaller functions
process_log_file () {
eval "set -- $control_dirs"
#we grep for finished jobs, then use sed to remove already processed lines
#OBS: deleted jobs have a 0008 message with not much info in it. A 0010
# message may follow (or not) with full usage stats. By this time the
# job has already been processed, so this info is ignored!
#TODO: make log scanning more intelligent.
exited_killed_jobs=`egrep '^[^;]*;0010;[^;]*;Job;|^[^;]*;0008;[^;]*;Job;[^;]*;Exit_status=|^[^;]*;0008;[^;]*;Job;[^;]*;Job deleted' ${lname} | tail -n+$(( $lines_skip + 1 ))`
#TODO should we add processed lines before jobs have actually been processed? What if the last job only has half a record?
new_lines=`echo "$exited_killed_jobs" | wc -l`
# new_lines set to 1 when string is empty, should have been 0
[ "x$exited_killed_jobs" = x ] && continue
lines_processed=$(( $lines_skip + $new_lines ))
if [ "$lines_processed" -lt '0' ] ; then
lines_processed=0;
fi
echo "$cname $lines_processed"> $state_file
exited_killed_jobs=`echo "$exited_killed_jobs" | sort -u`
# force word splitting to happen only on newlines
old_IFS=$IFS; IFS='
'
for job in $exited_killed_jobs; do
# Split line into fields by forcing word splitting to happen on ";"
IFS=";"
set_job_vars $job
IFS=$old_IFS
# Try to extract exit code of PBS (note: if executable fails it's code goes to PBS)
exit_code=`echo "$job_message" | sed -n 's/^.*Exit_status=\([-0-9]*\).*/\1/p'`
job_id=`echo "$job_id" | awk '{split($0,field,".");print field[1]"."field[2]}'`
# look for this id in job.ID.local, then in job.ID.grami
name=`find_by_local`
if [ -z "$name" ]; then
name=`find_by_grami`
if [ -z "$name" ]; then continue; fi
fi
if [ "$my_id" != '0' ] ; then
if [ ! -O "$name" ] ; then continue ; fi
fi
uid=$(get_owner_uid "$name")
[ -z "$uid" ] && { log "Failed to stat $name"; continue; }
base_name=`echo "$name" 2>/dev/null | sed -n 's/\.local$//p'`
if [ -z "${base_name}" ] ; then continue ; fi
# check if job already reported
if [ -f "${base_name}.lrms_done" ] ; then continue ; fi
statusfile=`echo "$name" 2>/dev/null | sed -n 's/job\.\([^\.]*\)\.local$/processing\/job.\1.status/p'`
# more protection - check if grid-manager thinks job is still running
egrep 'INLRMS|SUBMIT|CANCELING' "$statusfile" >/dev/null 2>&1
if [ ! $? = '0' ] ; then continue ; fi
# So far only PBS exit code is available
# It would be nice to have exit code of main executable
exitcode=''
# get session directory of this job
sessiondir=`grep -h '^sessiondir=' "${base_name}.local" | sed 's/^sessiondir=\(.*\)/\1/'`
diagfile="${sessiondir}.diag"
commentfile="${sessiondir}.comment"
if [ -z "$sessiondir" ] ; then
log "Failed to determine the path of the job's session directory"
else
# have chance to obtain exit code
if [ -z "${RUNTIME_NODE_SEES_FRONTEND}" ] ; then
# In case of non-NFS setup it may take some time till
# diagnostics file is delivered. Wait for it max 2 minutes.
# OBS: exitcode may never appear in the .diag file if the job was
# killed. There will be a 2 minute delay for every such job!
diag_tries=0
while [ "$diag_tries" -lt 20 ] ; do
job_read_diag # uses $sessiondir, $uid
if [ ! -z "$exitcode" ] ; then break ; fi
sleep 10
diag_tries=$(( $diag_tries + 1 ))
log "no exitcode in diag file $diagfile (try $diag_tries of 20)"
done
else
job_read_diag # uses $sessiondir, $uid
fi
fi
# Try to obtain message from PBS if any
pbs_comment=$(do_as_uid "$uid" "tail -n 1 '$commentfile'")
save_commentfile "$uid" "$commentfile" "${base_name}.errors"
# Extract values from PBS
walltime=`echo "$job_message" | sed -n 's/^.*resources_used.walltime=\(\([0-9]*:\)*[0-9][0-9]\).*/\1/p'`
cputime=`echo "$job_message" | sed -n 's/^.*resources_used.cput=\(\([0-9]*:\)*[0-9][0-9]\).*/\1/p'`
mem=`echo "$job_message" | sed -n 's/^.*resources_used.mem=\([0-9]*\)kb.*/\1/p'`
vmem=`echo "$job_message" | sed -n 's/^.*resources_used.vmem=\([0-9]*\)kb.*/\1/p'`
# Convert to utc and store as seconds
date_to_utc_seconds "$pbs_date"
if [ ! -z "$return_date_seconds" ]; then
# Convert from seconds to YYYYMMDDHHMMSSZ
seconds_to_mds_date "$return_date_seconds"
endtime=$return_mds_date
# Find out how many seconds the job executed
interval_to_seconds "$walltime"
if [ ! -z "$return_interval_seconds" ]; then
# Convert from seconds to YYYYMMDDHHMMSSZ
seconds_to_mds_date $(( $return_date_seconds - $return_interval_seconds ))
starttime=$return_mds_date
fi
fi
# Values to write to diag. These will override values already written.
interval_to_seconds "$walltime"
[ -n "$return_interval_seconds" ] && WallTime=$return_interval_seconds
interval_to_seconds "$cputime"
[ -n "$return_interval_seconds" ] && UserTime=$return_interval_seconds
[ -n "$return_interval_seconds" ] && KernelTime=0
[ -n "$mem" ] && UsedMemory=$mem
[ -n "$vmem" ] && TotalMemory=$vmem
[ -n "$starttime" ] && LRMSStartTime=$starttime
[ -n "$endtime" ] && LRMSEndTime=$endtime
[ -n "$pbs_comment" ] && LRMSMessage=$pbs_comment
[ -n "$exit_code" ] && LRMSExitcode=$exit_code
job_write_diag
if [ -z "$exitcode" ] ; then
# No exit code of job means job was most probably killed
if [ -z "$exit_code" ] ; then exit_code='-1'; fi
if [ "$exit_code" = '0' ] ; then
echo "Job $job_id failed but PBS have not noticed that" 1>&2
echo "-1 Job failed but PBS reported 0 exit code." > "${base_name}.lrms_done"
elif [ -z "$pbs_comment" ] ; then
echo "Job $job_id failed with PBS exit code $exit_code" 1>&2
echo "$exit_code Job was killed by PBS." > "${base_name}.lrms_done"
else
echo "Job $job_id failed with PBS exit code $exit_code" 1>&2
echo "$exit_code $pbs_comment" > "${base_name}.lrms_done"
fi
else
if [ -z "$exit_code" ] ; then exit_code='-1'; fi
if [ ! "$exitcode" = 0 ] ; then
if [ "$exit_code" = '0' ] ; then exit_code='-1'; fi
echo "Job $job_id failed with exit code $exitcode, PBS reported $exit_code." 1>&2
echo "$exit_code Job failed with exit code $exitcode." > "${base_name}.lrms_done"
else
if [ ! "$exit_code" = '0' ] ; then
echo "Job finished properly but PBS reported $exit_code." 1>&2
if [ -z "$pbs_comment" ] ; then
echo "$exit_code Job was killed by PBS." > "${base_name}.lrms_done"
else
echo "$exit_code $pbs_comment" > "${base_name}.lrms_done"
fi
else
# echo "Job finished without errors." 1>&2
echo "0" > "${base_name}.lrms_done"
fi
fi
fi
# wake up GM
${GMKICK} "${base_name}.local"
done
IFS=$old_IFS
}
# Walk over the files in $pbs_log_dir whose names consist of digits only.
# Every readable one flips readable_logs to yes; files named lower than the
# date stored in the state file are skipped, and for files named higher the
# skip counter is reset before the log is handed to process_log_file.
readable_logs=no
if [ -n "${pbs_log_dir}" ] ; then
  for cname in $(ls -1 ${pbs_log_dir}/ 2>/dev/null | grep '^[0-9]*$') ; do
    lname="${pbs_log_dir}/$cname"
    [ -r "$lname" ] || continue
    readable_logs=yes
    # Older than the last log recorded in the state file: nothing to do.
    [ "$cname" -lt "$ldate" ] && continue
    # Newer than the recorded log: start from its first line.
    [ "$cname" -gt "$ldate" ] && lines_skip=0
    echo "Date: " $cname
    # Remember the modification stamp so later changes can be detected.
    last_modified=$(stat $lname | grep Modify)
    process_log_file
  done
fi
# Main loop: when at least one readable log was found, keep polling the
# last log file looked at ($lname). Each round compares its "Modify" stamp
# with the remembered one; on a change the state file is re-read and the log
# is processed again. A round ends with a 10 second sleep and the loop stops
# once the round counter exceeds 60, after which the script exits with 0.
if [ "$readable_logs" = 'yes' ] ; then
  time_count=0
  until [ "$time_count" -gt 60 ] ; do
    new_modified=$(stat $lname | grep Modify)
    if [ "$new_modified" != "$last_modified" ] ; then
      last_modified="$new_modified"
      # Reload "<log name> <lines processed>" written by process_log_file.
      lines=$(cat "$state_file" 2>/dev/null)
      ldt=$(echo $lines | awk '{split($0,field," ");print field[1]}')
      lines=$(echo $lines | awk '{split($0,field," ");print field[2]}')
      lines_skip=$(( $lines + 0 ))
      ldate=$(( $ldt + 0 ))
      process_log_file
    fi
    sleep 10
    time_count=$(( $time_count + 1 ))
  done
  exit 0
fi
# Fallback when no readable PBS logs were found: use plain 'qstat'.
# Put the control directories back into the positional parameters.
eval "set -- $control_dirs"
# Dump the list of all jobs known to PBS into a temporary file.
if ! pidslist=$(mktemp "$TMPDIR/qstat.XXXXXX") ; then
  # FS problems ?
  # TODO debug output here
  sleep 60
  exit 1
fi
if ! ${PBS_BIN_PATH}/qstat -a 2>/dev/null 1>"$pidslist" ; then
  rm -f "$pidslist"
  # PBS server down ?
  sleep 60
  exit 1
fi
# exclude_completed copies stdin to stdout, dropping every line whose 10th
# whitespace-separated field is exactly "C" (presumably the job-state column
# of 'qstat -a' - confirm against the PBS flavour in use).
exclude_completed () {
  awk '$10 != "C"'
}
# Numeric ids (the digits before the first dot) of every job listed by
# qstat, except the lines removed by exclude_completed.
pids=$(grep '^[0-9][0-9]*\.' "$pidslist" | exclude_completed | sed 's/^\([0-9][0-9]*\).*/\1/')
rm -f "$pidslist"
# Go through directories. Directory names are quoted wherever they are
# passed to find so that control directories containing whitespace work.
for ctr_dir in "$@" ; do
  # Obtain ids stored in job.*.local (only for jobs that have a status file
  # under processing/)
  ids=$(find "${ctr_dir}/processing" -name 'job.*.status' -print0 \
    | sed 's/processing\/job\.\([^\.]*\)\.status/job.\1.local/g' \
    | xargs -0 grep -h "^localid=" 2>/dev/null | sed 's/^localid=\([0-9]*\).*/\1/')
  if [ -z "$ids" ] ; then continue ; fi
  # compare them to running jobs and find missing
  bids=
  for id in $ids ; do
    found=$(echo "$pids" | grep "^$id$")
    if [ -z "$found" ] ; then
      bids="$bids $id"
    fi
  done
  # go through missing ids
  for id in $bids ; do
    # find grid job corresponding to current local id
    jobfile=$(find "${ctr_dir}/processing" -name 'job.*.status' -print0 \
      | sed 's/processing\/job\.\([^\.]*\)\.status/job.\1.local/g' \
      | xargs -0 grep -F -l "localid=$id." 2>/dev/null)
    if [ -z "$jobfile" ] ; then continue ; fi
    # Unless running as root, only handle jobs whose .local file we own
    if [ "$my_id" != '0' ] ; then
      if [ ! -O "$jobfile" ] ; then continue ; fi
    fi
    uid=$(get_owner_uid "$jobfile")
    [ -z "$uid" ] && { log "Failed to stat $jobfile"; continue; }
    # extract grid id
    gridid=$(basename "$jobfile" '.local' | sed 's/^job\.//')
    donefile="${ctr_dir}/job.${gridid}.lrms_done"
    # already reported
    if [ -f "$donefile" ] ; then continue ; fi
    statusfile="${ctr_dir}/processing/job.${gridid}.status"
    if [ ! -f "$statusfile" ] ; then continue ; fi
    status=$(cat "$statusfile")
    if [ "$status" != "INLRMS" ] && [ "$status" != "CANCELING" ]; then continue ; fi
    # get session directory of this job
    session=$(grep -h '^sessiondir=' "$jobfile" | sed 's/^sessiondir=\(.*\)/\1/')
    if [ ! -z "$session" ] ; then
      # have chance to obtain exit code
      diagfile="${session}.diag"
      exitcode=$(do_as_uid "$uid" "grep '^exitcode=' '$diagfile'" | sed 's/^exitcode=//')
      if [ ! -z "$exitcode" ] ; then
        # job finished and exit code is known
        save_commentfile "$uid" "${session}.comment" "${ctr_dir}/job.${gridid}.errors"
        echo "$exitcode Executable finished with exit code $exitcode" > "$donefile"
        ${GMKICK} "$jobfile"
        echo "Job $gridid finished with exit code $exitcode"
        continue
      fi
    fi
    # job has probably finished and exit code is not known
    exitcode='-1'
    # Count in job.ID.lrms_job how many scans in a row found the job
    # missing; it is declared lost only once the counter exceeds 5.
    countfile="${ctr_dir}/job.${gridid}.lrms_job"
    counter=0
    if [ -f "$countfile" ] ; then
      counter=$(cat "$countfile")
      counter=$(( $counter + 1 ))
    fi
    if [ "$counter" -gt 5 ] ; then
      rm -f "$countfile"
      save_commentfile "$uid" "${session}.comment" "${ctr_dir}/job.${gridid}.errors"
      echo "$exitcode Job was lost with unknown exit code" > "$donefile"
      ${GMKICK} "$jobfile"
      echo "Job $gridid finished with unknown exit code"
    else
      echo "$counter" > "$countfile"
    fi
  done
  # go through existing ids
  for id in $pids ; do
    # find grid job corresponding to current local id
    jobfile=$(find "${ctr_dir}" -name 'job.*.local' -print0 | xargs -0 grep -F -l "localid=$id." 2>/dev/null)
    if [ -z "$jobfile" ] ; then continue ; fi
    gridid=$(basename "$jobfile" '.local' | sed 's/^job\.//')
    countfile="${ctr_dir}/job.${gridid}.lrms_job"
    # reset failure counter
    rm -f "$countfile"
  done
done
sleep 60
exit 0
|