This file is indexed.

/usr/lib/condor/shim_dmtcp is in htcondor 8.6.8~dfsg.1-2.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
#! /bin/sh

# play safe: referencing an unset variable is a fatal error
set -u

# TODO
# Figure out what happens when a program checkpoints while slowly reading
# stdin, does it restart properly?

# DEFAULTS, some not sane, some sane

# This first one is the default for debian installations...
DMTCP_PATH=/usr/bin

# This next one is a default for UW-Madison pools. It is the "release_dir".
CONDOR_PATH=/unsup/condor-production

# These are sane defaults.
LOG="/dev/null"
STDIN="/dev/null"
STDOUT="/dev/null"
STDERR="/dev/null"
# set default checkpoint interval to zero -- no repeated snapshotting
# but on-demand snapshot on vacate command
CKPTINT=${DMTCP_CHECKPOINT_INTERVAL:-0}

# The version of the shim script. Useful for debugging between groups using
# this shim script.
VERSION=0.4
# "$0" is quoted so a script path containing blanks is not split into
# several arguments to basename.
SCRIPTNAME=$(basename "$0")

# Host name with the UW-Madison CS domain suffix stripped; used in log
# lines, for condor_status lookups and as DMTCP_HOST on restart.
host=$(/bin/hostname | /bin/sed -e 's/\.cs\.wisc\.edu//g')

###############################################################
# cmdline args
###############################################################

# Print the usage summary and the option list on stdout.
# Globals: SCRIPTNAME (read) - program name shown in the text.
# The defaults quoted in the text mirror the DMTCP_PATH, CONDOR_PATH and
# CKPTINT assignments at the top of this script; they are spelled out
# literally (not expanded from the variables) because an option given before
# --help may already have overridden the variables.
print_help()
{
cat << EOT
Usage:  $SCRIPTNAME [OPTIONS] <executable> [<args>]

Options:
-h,--help
  Print usage summary and option list.

--version
  Print version information and exit.

--with-dmtcp path
  A path to an installation of DMTCP. If not specified, it defaults to
  /usr/bin. The current working directory of the shim script when
  it executes will always be tested for a DMTCP installation before using
  whatever is specified with --with-dmtcp regardless of its definition.
  The default may change in the future.

--with-condor path
  A path to an installation of Condor-aka the "release_dir". If not
  specified, it defaults to /unsup/condor-production. The condor functionality
  the shim script utilizes is not required by the shim script, but it'll
  make debugging certain failures a lot easier.
  The default may change in the future.

--log file
  Logfile name. By default no logfile is created.

--stdin file
  File with stdin content for the job. By default
  no stdin is directed to the job.

--stdout file
  File to store stdout of the job. By default
  no stdout is captured.

--stderr file
  File to store stderr of the job. By default
  no stderr is captured.

--ckptint=<integer>
  Checkpointing interval in seconds. $SCRIPTNAME also honors the
  DMTCP_CHECKPOINT_INTERVAL environment variable. Default: 0
  (no periodic checkpoints; a checkpoint is taken only on vacate).
EOT
}

# Write the one-sentence summary of what this shim is for to stdout,
# followed by one empty line.
print_description() {
	printf '%s\n\n' \
		"Shim script for DMTCP-based checkpointing of Condor vanilla universe jobs."
}

# Write the program name and version, the license/author notice and the
# list of additional contributors to stdout.
# Globals: SCRIPTNAME (read), VERSION (read)
print_version() {
	# One argument per output line; empty arguments yield the blank lines.
	printf '%s\n' \
		"$SCRIPTNAME $VERSION" \
		"" \
		"This code is under the Apache V2.0 License. It was written by" \
		"Peter Keller <psilord@cs.wisc.edu>." \
		"" \
		"Additional modifications by:" \
		"    Michael Hanke <michael.hanke@gmail.com>." \
		""
}

# Command-line handling.
# getopt normalises the arguments. Its result is stored in CLOPTS first
# because feeding it straight into `eval set --' would discard getopt's exit
# status. "$@" must stay quoted so each original parameter remains one word.
if ! CLOPTS=$(getopt -o h --long with-condor:,with-dmtcp:,help,version,log:,stdin:,stdout:,stderr:,ckptint: -n "$SCRIPTNAME" -- "$@"); then
  echo "Terminating..." >&2
  exit 1
fi

# Replace the positional parameters with the normalised list.
# The quotes around $CLOPTS are required.
eval set -- "$CLOPTS"

# Consume options until the `--' separator; every value-taking option has
# its value in $2.
while : ; do
  case "$1" in
    -h|--help)
      print_description
      print_help
      exit 0
      ;;
    --version)
      print_version
      exit 0
      ;;
    --with-dmtcp)  DMTCP_PATH=$2;  shift 2 ;;
    --with-condor) CONDOR_PATH=$2; shift 2 ;;
    --log)         LOG=$2;         shift 2 ;;
    --stdin)       STDIN=$2;       shift 2 ;;
    --stdout)      STDOUT=$2;      shift 2 ;;
    --stderr)      STDERR=$2;      shift 2 ;;
    --ckptint)     CKPTINT=$2;     shift 2 ;;
    --)
      shift
      break
      ;;
    *)
      echo "Internal error! ($1)"
      exit 1
      ;;
  esac
done

# What is left is the job's executable plus its arguments; the executable
# is mandatory.
if [ $# -lt 1 ]; then
  printf "Need at least one argument.\n\n"
  print_help
  exit 1
fi

# Append one message line, prefixed with "[<host> <timestamp>]:", to the
# log file.
# Globals:   host (read), LOG (read), d (written - timestamp scratch)
# Arguments: the message words; echo joins them with single blanks.
logit()
{
	d=$(date +"%D %R:%S %Z")

	# The header and the redirect target are quoted so that neither is
	# word-split or glob-expanded (a log path containing a blank would
	# otherwise be rejected as an ambiguous redirect by bash).
	echo "[$host $d]:" "$@" >> "$LOG"
}

# Append the arguments as one line to the log file, without the
# host/timestamp header that logit adds.
# Globals:   LOG (read)
# Arguments: the words to write; echo joins them with single blanks.
logitnohdr()
{
	# Quoted target: a log path containing blanks must stay a single word.
	echo "$@" >> "$LOG"
}

# Run a command with its stdout and stderr appended to the log file,
# bracketed by BEGIN/END marker lines, and log its exit status.
# Globals:   LOG (read), rc_cmd (written), rc_ret (written)
# Arguments: the command and its arguments, one word each.
# Returns:   the exit status of the command that was run.
runcmd()
{
	# Flattened copy, used for the log line only.
	rc_cmd=$*
	logit "running command: $rc_cmd"
	logit "---BEGIN STDOUT/ERR---"
	# "$@" keeps every argument exactly as the caller passed it.
	"$@" >> "$LOG" 2>&1
	# Capture the status right away: any later command (including logit)
	# overwrites $?.
	rc_ret=$?
	logit "---END STDOUT/ERR---"
	logit "command exited with: $rc_ret"
	return $rc_ret
}

# Log that a required DMTCP file is missing and terminate the script with
# exit status 1.
# Arguments: $1 - name of the missing file
die_bad_manifest() {
	# Swap the function's own argument list for the two log messages.
	set -- \
		"The required DMTCP file is not present: $1" \
		"This file must exist for the DMTCP shim to work properly."
	logit "$1"
	logit "$2"
	exit 1
}

# Assert that every DMTCP program the shim needs is present and remember
# where each one was found.
# For each name in the list, a shell variable of the same name (for example
# $dmtcp_command) is set to "./<name>" when the file exists in the current
# working directory, otherwise to "${DMTCP_PATH}/<name>". If a file is found
# in neither place, die_bad_manifest ends the script.
# Globals: DMTCP_PATH (read), files/file (written), one variable per program
assert_dmtcp_manifest ()
{
	files="dmtcp_checkpoint
dmtcp_coordinator
dmtcp_command
dmtcp_restart
mtcp_restart"

# Do not check the libs on Debian -- package dependencies make sure they are
# present and their location is different
#dmtcphijack.so
#libmtcp.so

	logit "Checking DMTCP manifest. An install in cwd overrides default choice."

	for file in $files
	do
		# The right-hand sides are escaped so that eval expands
		# ${DMTCP_PATH} and ${file} itself, inside double quotes. A path
		# containing blanks or shell metacharacters is then assigned
		# verbatim instead of being re-parsed as shell code.
		if [ -f "./$file" ] ; then
			eval "${file%.so}=\"./\${file}\""
		elif [ -f "${DMTCP_PATH}/$file" ]; then
			eval "${file%.so}=\"\${DMTCP_PATH}/\${file}\""
		else
			die_bad_manifest "$file"
		fi
	done

	logit "Manifest check passed."
}

# Log enough facts about this machine to tell hosts apart when debugging
# checkpoint permutation tests: host name, first line of /etc/issue, kernel
# release, first line of `gcc --version', and the checkpoint platform
# reported by condor_status (when that program is executable).
# Globals: host (read), CONDOR_PATH (read), ret (written), ckptsig (written)
ckptsignature() {
	logit "Hostname: $host"

	# Distribution banner, if the file is there.
	ret="Unknown"
	if [ -f /etc/issue ]; then
		ret=$(/usr/bin/head -1 < /etc/issue)
	fi
	logit "Linux Flavor: $ret"

	# Third field of `uname -a'.
	ret=$(/bin/uname -a | /usr/bin/awk '{print $3}')
	logit "Kernel: $ret"

	ret=$(/usr/bin/gcc --version | /usr/bin/head -1)
	logit "GCC revision: $ret"

	# See if there is a checkpoint platform
	if [ ! -x "${CONDOR_PATH}/bin/condor_status" ]; then
		logit "Checkpoint platform: Unknown"
	else
		ckptsig=$(${CONDOR_PATH}/bin/condor_status -l $host 2>&1 | /bin/egrep -i '^checkpointplatform' | /usr/bin/uniq)
		logit "Checkpoint platform: $ckptsig"
	fi
}

# Log the pause length, then sleep for that many seconds.
# Arguments: $1 - seconds to sleep; 3 is used when it is missing or empty
# Globals:   val (written)
delay() {
	val=${1:-}
	[ -n "$val" ] || val=3

	logit "sleeping $val seconds"
	/bin/sleep $val
}

# Signal handler (installed by the main script as the trap for signal 2):
# ask the DMTCP coordinator for a checkpoint, shut the job down, make sure
# the restart script is a regular file in the cwd, and exit 0.
# Globals: dmtcp_command (read), port (read), host (read),
#          realfile (written)
# This function never returns; it always ends the shim with `exit 0'.
checkpoint()
{
	logit "Soft kill signal received by $0 [pid=$$]: Starting checkpoint..."
	# NOTE(review): "bc" is presumably dmtcp_command's blocking-checkpoint
	# request -- confirm against the DMTCP documentation.
	runcmd ${dmtcp_command} --quiet --port $port bc

	# delay without an argument sleeps its default of 3 seconds.
	delay

	# when this returns, the checkpointing should be done.
	logit "what is the coordinator status"
	runcmd ${dmtcp_command} --quiet --port $port s

	# kill everything off.
	logit "killing process hierarchy (coordinator also goes away)"
	runcmd ${dmtcp_command} --quiet --port $port k

	delay

	# tell me if ./dmtcp_restart_script.sh is present, which is written by
	# the dmtcp_coordinator when I asked for the checkpoint, cause if it isn't
	# then that is a DMTCP bug (or full file system!)
	if [ -f "./dmtcp_restart_script.sh" ] ; then
		logit "Found dmtcp_restart_script.sh. Good!"

		# if it is a symlink, convert it to a real file
		if [ -h "./dmtcp_restart_script.sh" ] ; then
			logit "Converting symlink ./dmtcp_restart_script.sh to real file."
			# readlink may live in /usr/bin or /bin; take whichever is there.
			if [ -x /usr/bin/readlink ] ; then
				realfile=`/usr/bin/readlink ./dmtcp_restart_script.sh`
			else
				realfile=`/bin/readlink ./dmtcp_restart_script.sh`
			fi
			# Drop the link, then move its target into the link's place.
			/bin/rm ./dmtcp_restart_script.sh
			/bin/mv "$realfile" "./dmtcp_restart_script.sh"
			# If the target still exists under its old name, the move failed.
			if [ -f "$realfile" ] ; then
				logit "Problem: $realfile was not converted to ./dmtcp_restart_script.sh"
			fi
		fi
		logit "Next invocation is restart!"
	else
		logit "Didn't find dmtcp_restart_script.sh. Next invocation is initial!"
		logit "This is bad and means DMTCP failed to write it, or a full fs."
	fi

	logit "About to exit 0 in signal handler after sleeping 2 seconds."
	logit "Checkpointing continuation on $host"
	# Draw an arrow in the log that the next invocation's log lines continue.
	logitnohdr "                                        |"
	logitnohdr "                                        V"
	logitnohdr "                                        *"
	/bin/sleep 2
	exit 0
}

# Tell the caller whether a checkpoint has already been taken in this
# directory. NOTE: the exit status is a flag value, not a shell truth value:
#   0 - ./dmtcp_restart_script.sh does not exist (initial run)
#   1 - ./dmtcp_restart_script.sh exists (this invocation is a restart)
# The main script stores $? in `restart' and compares it against 0.
is_restart() {
	[ ! -f "./dmtcp_restart_script.sh" ]
}

###############################################################
# start
###############################################################

# These two lines land in the stdout named in the job.sub file. That file
# is truncated on every restart, so the useful information is kept in the
# log file handed to the shim script instead.
echo "Please see the file $LOG for what happened to this job."
echo "Shim Script Version: $VERSION"

# Remember whether this is the first run (0) or a resumption (non-zero).
is_restart
restart=$?

# Open this invocation's section of the log accordingly.
case $restart in
	0)
		logitnohdr "                                        -"
		logitnohdr "                                        |"
		logitnohdr "                                        V"
		logit "Initial shim script (Version: $VERSION) invocation on $host"
		;;
	*)
		logitnohdr "                                        |"
		logitnohdr "                                        V"
		logit "Resumption shim script (Version: $VERSION) invocation on $host"
		;;
esac

# Record our pid and where the job's standard streams are wired.
logit "Shim script start [pid: $$]"
logit "STDIN: $STDIN"
logit "STDOUT: $STDOUT"
logit "STDERR: $STDERR"

# Bail out now unless every required DMTCP program can be found.
assert_dmtcp_manifest

# WARNING:
# Close the gcb inherited fd which was left open to an unlinked file when the
# job executes under a gcb enabled execute node.
# DMTCP croaks on it because it can't restore it.
# The fd number is taken from the value of whichever environment entry
# matches CB_INHERITFILE.
FD=$(/usr/bin/env | /bin/grep CB_INHERITFILE | /bin/sed -e 's/.*=//')
if [ -n "$FD" ] ; then
	case $FD in
		*[!0-9]*)
			# The value ends up inside an eval'ed `exec'; anything but
			# plain digits (a path, several matching lines, ...) would
			# be run as a command that replaces this shell.
			logit "Ignoring GCB fd value that is not a plain number: $FD"
			;;
		*)
			logit "Closing secretly inherited GCB fd $FD"
			IFD=$FD
			# Any non-zero status counts as a failed close.
			if eval "exec $IFD>&-" ; then
				logit "GCB fd $FD closed."
			else
				logit "Oops! The close didn't happen properly! Attempting to continue."
			fi
			;;
	esac
fi

# What kind of a checkpoint signature does this machine have?
ckptsignature

# Each job will have its own dmtcp coordinator that exits when the job finishes.
logit "starting the dmtcp_coordinator process"
# We start it on an ephemeral port and save it for later: the port number is
# scraped from the "Port:" line the coordinator prints.
port=$("${dmtcp_coordinator}" --port 0 --exit-on-last --interval "${CKPTINT}" --daemon 2>&1 | grep "Port:" | /bin/sed -e 's/Port://g' -e 's/[ \t]//g')

# $? here would be the status of the last pipeline stage (sed), not of the
# coordinator, so success is judged by whether a port was actually reported.
if [ -n "$port" ]; then
	logit "started dmtcp_coordinator on port $port with checkpoint interval ${CKPTINT}"
else
	logit "could not start dmtcp_coordinator"
	logit "dmtcp_coordinator port is unknown! Aborting."
	exit 1
fi

# give it time to wake up.
delay 2

# catch sigint, checkpoint and exit on vacate.
logit "Setting signal trap for SIGINT [2]"
trap checkpoint 2

# Either launch the job under DMTCP for the first time or resume it from
# the checkpoint in the cwd, wait for it, clean up the DMTCP bookkeeping
# files and exit with the job's status.
if [ $restart -eq 0 ]; then
	logit "running application for the first time"

	# Don't run this with runcmd, since I want the stdout/err of the process to
	# flow through this shell.
	# The job runs in the background and is waited for so that the SIGINT
	# trap can interrupt the wait. "$@" passes the job's arguments through
	# one word each.
	# XXX put --join back once https://bugs.debian.org/765741 is resolved
	"${dmtcp_checkpoint}" --port "$port" "$@" <"$STDIN" 1>"$STDOUT" 2>"$STDERR" &
	app_pid=$!
	wait "$app_pid"
	ret=$?

	logit "wait returned with value $ret!"

	if [ $ret -gt 128 ] ; then
		logit "got vacate signal, waiting until signal handler exit"
		# Idle until the checkpoint trap handler ends the script; sleeping
		# instead of spinning keeps this from eating a whole CPU.
		# NOTE(review): a job that itself ends with a status above 128
		# also lands here and is then held until the shim is signalled --
		# confirm that this is intended.
		while :
		do
			/bin/sleep 1
		done
	else
		logit "Removing DMTCP checkpoint files"
		# DMTCP_PREFIX_ID may be unset; without the :- fallback `set -u'
		# would abort the script here and lose the job's exit status.
		rm -f dmtcpConTable.${DMTCP_PREFIX_ID:-}*
	fi

	logit "Program terminated."

	exit $ret

else
	# We inherit DMTCP_PREFIX_ID from the environment
	logit "Restarting application: "
	DMTCP_PORT=$port
	DMTCP_HOST=$host
	DMTCP_RESTART_DIR=./
	# This next line is because the dmtcp_restart_script.sh program expects
	# to find dmtcp_restart in the path.
	PATH=.:$PATH
	export DMTCP_PORT
	export DMTCP_HOST
	export DMTCP_RESTART_DIR
	export PATH
	# PATH already carries the leading ".:" at this point.
	logit "PATH=$PATH"
	logit "DMTCP_PORT=$DMTCP_PORT"
	logit "DMTCP_HOST=$DMTCP_HOST"
	logit "DMTCP_RESTART_DIR=$DMTCP_RESTART_DIR"
	logit "./dmtcp_restart_script.sh"
	./dmtcp_restart_script.sh &
	app_pid=$!
	wait "$app_pid"
	ret=$?
	logit "Application returned: $ret"
	logit "Removing DMTCP checkpoint files"
	# The :- fallback keeps `set -u' from aborting the cleanup (and the
	# final exit) when DMTCP_PREFIX_ID was not inherited.
	rm -f ckpt*${DMTCP_PREFIX_ID:-}*.dmtcp
	rm -f dmtcp*Table*${DMTCP_PREFIX_ID:-}*
	rm -f dmtcp*Map*${DMTCP_PREFIX_ID:-}*
	rm -f dmtcp*List*${DMTCP_PREFIX_ID:-}*
	rm -f dmtcp_restart_script.sh
	exit $ret;
fi

logit "Should never get here!"
exit 1;