/usr/lib/condor/shim_dmtcp is in htcondor 8.6.8~dfsg.1-2.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 | #! /bin/sh
# Play safe: treat the expansion of any unset variable as an error.
set -u
# TODO: Figure out what happens when a program checkpoints while slowly
# reading stdin -- does it restart properly?
#
# DEFAULTS, some not sane, some sane.
# Directory searched for the DMTCP executables. This first one is the default
# for Debian installations; --with-dmtcp overrides it.
DMTCP_PATH=/usr/bin
# This next one is a default for UW-Madison pools. It is the Condor
# "release_dir"; --with-condor overrides it.
CONDOR_PATH=/unsup/condor-production
# These are sane defaults: no log file, and the job's stdio goes nowhere
# unless --log/--stdin/--stdout/--stderr say otherwise.
LOG="/dev/null"
STDIN="/dev/null"
STDOUT="/dev/null"
STDERR="/dev/null"
# Set the default checkpoint interval to zero -- no repeated snapshotting,
# only an on-demand snapshot on the vacate command. The environment variable
# DMTCP_CHECKPOINT_INTERVAL, and later --ckptint, can override this.
CKPTINT=${DMTCP_CHECKPOINT_INTERVAL:-0}
# The version of the shim script. Useful for debugging between groups using
# this shim script.
VERSION=0.4
# $0 is quoted so that a script path containing whitespace still yields a
# single, correct name.
SCRIPTNAME=$(basename "$0")
# Host name with the UW-Madison CS domain suffix stripped; used in log headers.
host=$(/bin/hostname | /bin/sed -e 's/\.cs\.wisc\.edu//g')
###############################################################
# cmdline args
###############################################################
# Print the usage summary and the option list to stdout.
# Globals: SCRIPTNAME (read) - name shown in the usage line.
# The defaults quoted in the text mirror the values assigned at the top of
# the script (DMTCP_PATH=/usr/bin, checkpoint interval 0).
print_help()
{
    cat << EOT
Usage: $SCRIPTNAME [OPTIONS] <executable> [<args>]
Options:
-h,--help
Print usage summary and option list.
--version
Print version information and exit.
--with-dmtcp path
A path to an installation of DMTCP. If not specified, it defaults to
/usr/bin. The current working directory of the shim_script when
it executes will always be tested for a DMTCP installation before using
whatever is specified with --with-dmtcp regardless of its definition.
The default may change in the future.
--with-condor path
A path to an installation of Condor-aka the "release_dir". If not
specified, it defaults to /unsup/condor-production. The condor functionality
the shim script utilizes is not required by the shim script, but it'll
make debugging certain failures a lot easier.
The default may change in the future.
--log
Logfile name. By default no logfile is created.
--stdin
File with stdin content for the job. By default
no stdin is directed to the job.
--stdout
File to store stdout of the job. By default
no stdout is captured.
--stderr
File to store stderr of the job. By default
no stderr is captured.
--ckptint=<integer>
Checkpointing interval in seconds. $SCRIPTNAME also honors the
DMTCP_CHECKPOINT_INTERVAL environment variable. Default: 0 (no periodic
checkpoints; a checkpoint is taken only on vacate).
EOT
}
# Print the one-line description of what this script is for to stdout.
print_description()
{
    printf '%s\n' \
        "Shim script for DMTCP-based checkpointing of Condor vanilla universe jobs."
}
# Print the script name, its version, and license/author credits to stdout.
# Globals: SCRIPTNAME (read), VERSION (read)
print_version()
{
    printf '%s\n' \
        "$SCRIPTNAME $VERSION" \
        "This code is under the Apache V2.0 License. It was written by" \
        "Peter Keller <psilord@cs.wisc.edu>." \
        "Additional modifications by:" \
        "Michael Hanke <michael.hanke@gmail.com>."
}
# Parse the command line.
# "$@" is quoted so that every command-line parameter reaches getopt as its
# own word. getopt's normalised output is captured in CLOPTS first, because
# running `eval set --' directly would hide getopt's exit status.
if ! CLOPTS=$(getopt -o h --long with-condor:,with-dmtcp:,help,version,log:,stdin:,stdout:,stderr:,ckptint: -n "$SCRIPTNAME" -- "$@"); then
    echo "Terminating..." >&2
    exit 1
fi
# Re-install the normalised words as the positional parameters; the quotes
# around $CLOPTS are essential.
eval set -- "$CLOPTS"
# Consume options until the "--" separator; every value-taking option has its
# value in $2.
while :; do
    case "$1" in
        -h|--help)
            print_description
            print_help
            exit 0
            ;;
        --version)
            print_version
            exit 0
            ;;
        --with-dmtcp)  DMTCP_PATH=$2;  shift 2 ;;
        --with-condor) CONDOR_PATH=$2; shift 2 ;;
        --log)         LOG=$2;         shift 2 ;;
        --stdin)       STDIN=$2;       shift 2 ;;
        --stdout)      STDOUT=$2;      shift 2 ;;
        --stderr)      STDERR=$2;      shift 2 ;;
        --ckptint)     CKPTINT=$2;     shift 2 ;;
        --)
            shift
            break
            ;;
        *)
            echo "Internal error! ($1)"
            exit 1
            ;;
    esac
done
# What is left is the executable to run plus its arguments; it is mandatory.
if [ $# -lt 1 ]; then
    printf "Need at least one argument.\n\n"
    print_help
    exit 1
fi
# Append one line to the log file, prefixed with "[<host> <timestamp>]:".
# Globals:   host (read), LOG (read), d (written: timestamp scratch value)
# Arguments: the message words; they are joined with single spaces by echo.
logit()
{
    d=$(date +"%D %R:%S %Z")
    # The header and the log path are quoted so that neither is word-split or
    # glob-expanded; an unquoted $LOG fails for a path containing whitespace.
    echo "[$host $d]:" "$@" >> "$LOG"
}
# Append one line to the log file without the host/timestamp header.
# Globals:   LOG (read)
# Arguments: the message words; they are joined with single spaces by echo.
logitnohdr()
{
    # Quote the log path: an unquoted $LOG fails for a path with whitespace.
    echo "$@" >> "$LOG"
}
# Run a command with its stdout and stderr appended to the log file, and
# bracket that output with marker lines.
# Globals:   LOG (read), rc_ret (written: exit status of the command)
# Arguments: the command and its arguments, one word each
# Returns:   the exit status of the command that was run
runcmd()
{
    logit "running command: $*"
    logit "---BEGIN STDOUT/ERR---"
    # "$@" keeps every argument intact (no re-splitting of a flattened string).
    "$@" >> "$LOG" 2>&1
    # Capture the status immediately: any later command, including logit,
    # would overwrite $?.
    rc_ret=$?
    logit "---END STDOUT/ERR---"
    logit "command exited with: $rc_ret"
    return $rc_ret
}
# Log that a required DMTCP file is missing, then terminate the script.
# Arguments: $1 - name of the file that could not be found
# Does not return: always exits with status 1.
die_bad_manifest()
{
    logit "The required DMTCP file is not present: $1"
    logit "This file must exist for the DMTCP shim to work properly."

    exit 1
}
# This function asserts that all of the files we need for DMTCP are present
# and records where each one was found.
#
# For every required executable, a copy in the current working directory
# takes precedence over the one in $DMTCP_PATH. The chosen path is stored in
# a global variable named after the file (e.g. $dmtcp_command). If a file is
# in neither place, die_bad_manifest is called with its name.
# Globals: DMTCP_PATH (read); files, file and one variable per listed
#          executable (written)
assert_dmtcp_manifest ()
{
    files="dmtcp_checkpoint dmtcp_coordinator dmtcp_command dmtcp_restart mtcp_restart"
    # Do not check the libs on Debian -- package dependencies make sure they
    # are present, and their location is different.
    #dmtcphijack.so
    #libmtcp.so
    logit "Checking DMTCP manifest. An install in cwd overrides default choice."
    for file in $files
    do
        # Only the variable name is expanded before eval runs; the value is
        # expanded by eval itself inside double quotes, so whitespace or shell
        # metacharacters in DMTCP_PATH cannot be split or executed.
        if [ -f "./$file" ] ; then
            eval "${file%.so}=\"./\${file}\""
        elif [ -f "${DMTCP_PATH}/$file" ]; then
            eval "${file%.so}=\"\${DMTCP_PATH}/\${file}\""
        else
            die_bad_manifest "$file"
        fi
    done
    logit "Manifest check passed."
}
# Try to identify the machine we are on well enough to debug checkpoint
# permutation testing. Everything is written to the log via logit:
# host name, first line of /etc/issue, kernel release, first line of
# `gcc --version`, and (when condor_status is available) the checkpoint
# platform.
# Globals: host, CONDOR_PATH (read); ret, ckptsig (written)
ckptsignature()
{
    logit "Hostname: $host"

    # Distribution banner, when the file exists.
    if [ ! -f /etc/issue ]; then
        logit "Linux Flavor: Unknown"
    else
        ret=$(/usr/bin/head -1 < /etc/issue)
        logit "Linux Flavor: $ret"
    fi

    # Third field of `uname -a`.
    ret=$(/bin/uname -a | /usr/bin/awk '{print $3}')
    logit "Kernel: $ret"

    ret=$(/usr/bin/gcc --version | /usr/bin/head -1)
    logit "GCC revision: $ret"

    # See if there is a checkpoint platform
    if [ ! -x "${CONDOR_PATH}/bin/condor_status" ]; then
        logit "Checkpoint platform: Unknown"
    else
        ckptsig=$(${CONDOR_PATH}/bin/condor_status -l $host 2>&1 | /bin/egrep -i '^checkpointplatform' | /usr/bin/uniq)
        logit "Checkpoint platform: $ckptsig"
    fi
}
# Log, then sleep for, the given number of seconds.
# Arguments: $1 - seconds to sleep; 3 is used when it is missing or empty
# Globals:   val (written: the number of seconds actually used)
delay()
{
    if [ -n "${1:-}" ]; then
        val=$1
    else
        val=3
    fi
    logit "sleeping $val seconds"
    /bin/sleep $val
}
# Signal handler: ask DMTCP to checkpoint the job, tear the computation down,
# make sure the restart script is a regular file, and exit the shim with
# status 0. It is installed elsewhere in this script as the trap action for
# signal 2 (SIGINT).
# Globals read: dmtcp_command, port, host.
# Globals written: realfile (only when the restart script is a symlink).
# Uses the helpers logit, logitnohdr, runcmd and delay.
# Does not return to the interrupted code: it always ends in `exit 0`.
checkpoint()
{
logit "Soft kill signal received by $0 [pid=$$]: Starting checkpoint..."
# Send the "bc" request to the coordinator (presumably a blocking
# checkpoint -- confirm against the dmtcp_command documentation).
runcmd ${dmtcp_command} --quiet --port $port bc
delay
# when this returns, the checkpointing should be done.
logit "what is the coordinator status"
# "s": have the coordinator status written to the log via runcmd.
runcmd ${dmtcp_command} --quiet --port $port s
# kill everything off.
logit "killing process hierarchy (coordinator also goes away)"
runcmd ${dmtcp_command} --quiet --port $port k
delay
# tell me if ./dmtcp_restart_script.sh is present, which is written by
# the dmtcp_coordinator when I asked for the checkpoint, cause if it isn't
# then that is a DMTCP bug (or full file system!)
if [ -f "./dmtcp_restart_script.sh" ] ; then
logit "Found dmtcp_restart_script.sh. Good!"
# if it is a symlink, convert it to a real file
if [ -h "./dmtcp_restart_script.sh" ] ; then
logit "Converting symlink ./dmtcp_restart_script.sh to real file."
# readlink may live in /usr/bin or /bin; prefer /usr/bin when it is executable.
if [ -x /usr/bin/readlink ] ; then
realfile=`/usr/bin/readlink ./dmtcp_restart_script.sh`
else
realfile=`/bin/readlink ./dmtcp_restart_script.sh`
fi
# Drop the link, then move its target into the link's place.
/bin/rm ./dmtcp_restart_script.sh
/bin/mv "$realfile" "./dmtcp_restart_script.sh"
# If the target still exists under its old name, the move did not happen.
if [ -f "$realfile" ] ; then
logit "Problem: $realfile was not converted to ./dmtcp_restart_script.sh"
fi
fi
logit "Next invocation is restart!"
else
logit "Didn't find dmtcp_restart_script.sh. Next invocation is initial!"
logit "This is bad and means DMTCP failed to write it, or a full fs."
fi
logit "About to exit 0 in signal handler after sleeping 2 seconds."
logit "Checkpointing continuation on $host"
# Draw a continuation marker in the log.
logitnohdr " |"
logitnohdr " V"
logitnohdr " *"
/bin/sleep 2
exit 0
}
# Report whether a checkpoint has already been taken, i.e. whether
# ./dmtcp_restart_script.sh exists as a regular file in the current directory.
#
# NOTE: the result is a numeric flag that the caller reads from $?, not a
# shell-style success/failure:
#   1 - the restart script exists (this invocation is a restart)
#   0 - the restart script does not exist (this is the initial run)
is_restart()
{
    # The status of the negated test is exactly the flag described above.
    [ ! -f "./dmtcp_restart_script.sh" ]
}
###############################################################
# start
###############################################################
# These two lines go to the stdout specified in the job.sub file. Since that
# file gets truncated on every restart, the useful information is kept in the
# log file given to the shim script instead.
echo "Please see the file $LOG for what happened to this job."
echo "Shim Script Version: $VERSION"
# Run or restart the job? restart is 0 for an initial run and non-zero when a
# restart script from an earlier checkpoint is present.
is_restart
restart=$?
# Pick the log banner for this invocation; an initial run additionally opens
# the log "arrow" with a " -" line.
if [ "$restart" -ne 0 ]; then
    banner="Resumption shim script (Version: $VERSION) invocation on $host"
else
    logitnohdr " -"
    banner="Initial shim script (Version: $VERSION) invocation on $host"
fi
logitnohdr " |"
logitnohdr " V"
logit "$banner"
logit "Shim script start [pid: $$]"
logit "STDIN: $STDIN"
logit "STDOUT: $STDOUT"
logit "STDERR: $STDERR"
# Ensure we have everything we need, otherwise bail!
assert_dmtcp_manifest
# WARNING:
# Close the GCB-inherited fd which was left open to an unlinked file when the
# job executes under a GCB-enabled execute node.
# DMTCP croaks on it because it can't restore it.
# The fd number is the value of the environment entry whose line contains
# CB_INHERITFILE.
FD=$(/usr/bin/env | /bin/grep CB_INHERITFILE | /bin/sed -e 's/.*=//')
if [ -n "$FD" ]; then
    logit "Closing secretly inherited GCB fd $FD"
    IFD=$(/usr/bin/expr "$FD")
    # NOTE(review): this evals a value taken from the environment.
    eval "exec $IFD>&-"
    case $? in
        1)
            logit "Oops! The close didn't happen properly! Attempting to continue."
            ;;
        *)
            logit "GCB fd $FD closed."
            ;;
    esac
fi
# What kind of a checkpoint signature does this machine have?
ckptsignature
# Each job will have its own dmtcp coordinator that exits when the job finishes.
logit "starting the dmtcp_coordinator process"
# We start it on an ephemeral port and save the port it reports for later.
port=$("${dmtcp_coordinator}" --port 0 --exit-on-last --interval "${CKPTINT}" --daemon 2>&1 | grep "Port:" | /bin/sed -e 's/Port://g' -e 's/[ \t]//g')
# The exit status of the pipeline above is that of the final sed, which says
# nothing about the coordinator. An empty port is the only reliable sign that
# the coordinator did not start (or did not report its port).
if [ -z "$port" ]; then
    logit "could not start dmtcp_coordinator"
    logit "dmtcp_coordinator port is unknown! Aborting."
    exit 1
fi
logit "started dmtcp_coordinator on port $port with checkpoint interval ${CKPTINT}"
# give it time to wake up.
delay 2
# Catch SIGINT: checkpoint and exit on vacate.
logit "Setting signal trap for SIGINT [2]"
# vacating records that the signal handler was entered. It lets the code after
# `wait' tell a vacate apart from the job itself having been killed by a
# signal; both give a wait status above 128.
vacating=0
trap 'vacating=1; checkpoint' 2
# DMTCP_PREFIX_ID is inherited from the environment. Default it to empty so
# that `set -u' does not abort the script during cleanup and lose the job's
# exit status; with an empty id the cleanup globs simply match more broadly.
prefix_id=${DMTCP_PREFIX_ID:-}
if [ "$restart" -eq 0 ]; then
    logit "running application for the first time"
    # Don't run this with runcmd, since I want the stdout/err of the process to
    # flow through this shell.
    # XXX put --join back once https://bugs.debian.org/765741 is resolved
    "${dmtcp_checkpoint}" --port "$port" "$@" <"$STDIN" 1>"$STDOUT" 2>"$STDERR" &
    wait "$!"
    ret=$?
    logit "wait returned with value $ret!"
    if [ "$ret" -gt 128 ] && [ "$vacating" -eq 1 ]; then
        # The handler ends the script itself; just idle (without spinning the
        # CPU) until it does.
        logit "got vacate signal, waiting until signal handler exit"
        while :
        do
            /bin/sleep 1
        done
    fi
    if [ "$ret" -gt 128 ]; then
        # No vacate was requested, so the job itself died from a signal.
        # Report it and finish instead of waiting forever for a handler that
        # will never run.
        logit "application was terminated by a signal; no vacate in progress"
    fi
    logit "Removing DMTCP checkpoint files"
    rm -f dmtcpConTable."${prefix_id}"*
    logit "Program terminated."
    exit "$ret"
else
    logit "Restarting application: "
    DMTCP_PORT=$port
    DMTCP_HOST=$host
    DMTCP_RESTART_DIR=./
    # This next line is because the dmtcp_restart_script.sh program expects
    # to find dmtcp_restart in the path.
    PATH=.:$PATH
    export DMTCP_PORT
    export DMTCP_HOST
    export DMTCP_RESTART_DIR
    export PATH
    # PATH already carries the leading ".:" at this point; log it as it is.
    logit "PATH=$PATH"
    logit "DMTCP_PORT=$DMTCP_PORT"
    logit "DMTCP_HOST=$DMTCP_HOST"
    logit "DMTCP_RESTART_DIR=$DMTCP_RESTART_DIR"
    logit "./dmtcp_restart_script.sh"
    ./dmtcp_restart_script.sh &
    wait "$!"
    ret=$?
    logit "Application returned: $ret"
    logit "Removing DMTCP checkpoint files"
    rm -f ckpt*"${prefix_id}"*.dmtcp
    rm -f dmtcp*Table*"${prefix_id}"*
    rm -f dmtcp*Map*"${prefix_id}"*
    rm -f dmtcp*List*"${prefix_id}"*
    rm -f dmtcp_restart_script.sh
    exit "$ret"
fi
# Both branches above exit, so this is only a safety net.
logit "Should never get here!"
exit 1
|