#!/bin/tcsh -f
# JLdL 08Dec13.
#
# Copyright (C) 2005-2013 by Jorge L. deLyra <delyra@fma.if.usp.br>.
# This program may be copied and/or distributed freely. See the
# _ terms and conditions in /usr/share/doc/<package>/copyright.
#
# This program runs arbitrary apt-get operations on all the nodes of a cluster,
# _ using chroot; it passes all but the help and configuration file options
# _ unchanged to apt-get; it calls an anchor script within each node to do the
# _ preparatory and finalization work, before and after an apt-get operation.
#
# NOTE: this program does not handle error messages
# _ in languages other than English.
#
# Arrange for a clean exit in case this script is interrupted: control
# _ is then transferred to the cleanexit label, further down, where
# _ the filesystems of the node are unmounted.
onintr cleanexit
#
# Keep the name under which this script was called, for use in messages.
set name = `basename $0`
#
# Configuration file: the flag is raised while the -C option is still
# _ waiting for its argument; the path starts out as the default one.
set conflag = 0 confile = "/etc/cluster.conf"
#
# The list of options and arguments to be passed on to apt-get.
set agoparg = ""
#
# The type of mount to be done: 1 means local bind mounts, which
# _ is the default, while 0 means loopback NFS mounts.
set bindmnt = 1
#
# Pausing at the end: the pause flag itself, the flag which is raised
# _ while the -P option is still waiting for its argument, and the
# _ pause time in seconds.
set pseflag = 0 pseargm = 0 psetime = 0
#
# Process the command-line arguments; the options -h, -C, -B, -N, -E
# _ and -P, in either their short or long forms, are consumed here,
# _ and everything else is accumulated in agoparg, in the order
# _ given, to be passed on unchanged to apt-get.
foreach cla ( $* )
    #
    # Detect options: anything starting with a dash.
    if ( "`echo -n $cla | cut -c 1`" == "-" ) then
	#
	# If we got here with an argument flag up, then the previous option
	# _ did not get its argument; this is an error in the case of -C.
	if ( $conflag == 1 ) then
	    echo "${name}: ERROR: option -C requires an argument"
	    exit 1
	else if ( $pseargm == 1 ) then
	    #
	    # In the case of -P we just assume that the argument is
	    # _ missing and attribute the default value of 0.
	    set psetime = 0
	    #
	    # Lower the pause argument flag.
	    set pseargm = 0
	endif
	#
	# Now process the options.
	switch ( $cla )
	case "-h":
	case "--help":
	    #
	    # Print a usage message and exit.
	    echo "usage: $name [-C <config>] [-B|-N] [-E|-P [n]] <apt-get-options-and-arguments>"
	    echo "       -C: use alternate configuration file <config>"
	    echo "       -B: use local 'bind' mounts for the node filesystems"
	    echo "       -N: use loopback NFS mounts for the node filesystems"
	    echo "       -E: exit immediately after executing the task"
	    echo "       -P: pause for n seconds after execution and then exit"
	    echo "               If n=0 then wait for [Enter] after execution"
	    echo "       run apt-get operations on the nodes of a cluster,"
	    echo "       using chroot to access the node filesystems;"
	    echo "       you can use any apt-get options and arguments, run"
	    echo "       the commands 'apt-get -h' or 'man apt-get' to look"
	    echo "       up all the possible options and arguments; in order"
	    echo "       to get the details run 'man $name'"
	    exit 0
	    breaksw
	case "-C":
	case "--Config-file":
	    #
	    # Raise the flag: the next argument is the configuration file.
	    set conflag = 1
	    breaksw
	case "-B":
	case "--bind-mounts":
	    #
	    # Raise the bind-mounts flag.
	    set bindmnt = 1
	    breaksw
	case "-N":
	case "--NFS-mounts":
	    #
	    # Lower the bind-mounts flag.
	    set bindmnt = 0
	    breaksw
	case "-E":
	case "--Exit":
	    #
	    # Lower the pause flag.
	    set pseflag = 0
	    #
	    # Zero the pause time.
	    set psetime = 0
	    breaksw
	case "-P":
	case "--Pause":
	    #
	    # Raise the pause flag.
	    set pseflag = 1
	    #
	    # Raise the pause argument flag: the next
	    # _ argument may be the pause time.
	    set pseargm = 1
	    breaksw
	default:
	    #
	    # Accumulate apt-get options.
	    set agoparg = ( $agoparg $cla )
	    breaksw
	endsw
    #
    # Process non-option arguments.
    else
	#
	# Get the arguments of options.
	if ( $conflag == 1 ) then
	    #
	    # Set the configuration file.
	    set confile = $cla
	    #
	    # Lower the flag.
	    set conflag = 0
	else if ( $pseargm == 1 ) then
	    #
	    # Check whether the argument is a number.
	    echo $cla | grep -q '^[0-9]*$'
	    #
	    # If it is, then set the pause time; otherwise, set the time
	    # _ to the default value and pass the argument on to apt-get,
	    # _ just like any other non-option argument.
	    if ( $status == 0 ) then
		set psetime = $cla
	    else
		set psetime = 0
		set agoparg = ( $agoparg $cla )
	    endif
	    #
	    # Lower the pause argument flag.
	    set pseargm = 0
	else
	    #
	    # Accumulate apt-get arguments.
	    set agoparg = ( $agoparg $cla )
	endif
    endif
end
#
# If we got here with an argument flag up, then the last option on the
# _ command line did not get its argument; this is an error for -C.
if ( $conflag == 1 ) then
    echo "${name}: ERROR: option -C requires an argument"
    exit 1
else if ( $pseargm == 1 ) then
    #
    # In the case of -P we just assume that the argument is
    # _ missing and attribute the default value of 0.
    set psetime = 0
    #
    # Lower the argument flag.
    set pseargm = 0
endif
#
# The configuration file has to be readable, otherwise there is nothing
# _ to be done; once sourced, it must have defined the variables
# _ nick_name; virt_node; cluster_root; mount_points.
if ( ! -r $confile ) then
    echo "${name}: ERROR: cannot read configuration file $confile"
    exit 1
endif
source $confile
#
# Do some simple error detection: go through the mandatory variables, in
# _ order, and quit at the first one which the configuration file left
# _ undefined.
foreach reqvar ( nick_name virt_node cluster_root mount_points )
    #
    # The name of the variable is only known at run time, so that the
    # _ existence test has to be assembled as a string and run by eval.
    eval 'set isdef = $?'"$reqvar"
    if ( ! $isdef ) then
	echo "${name}: ERROR: $reqvar not defined in configuration file"
	exit 1
    endif
end
#
# The optional configuration variables get default values, in
# _ case the configuration file did not define them; first the
# _ number of retries and the interval between them, in seconds.
if ( ! $?mount_retries ) set mount_retries = 12
if ( ! $?retry_timeout ) set retry_timeout = 10
#
# By default there are no extra mounts.
if ( ! $?extra_mounts ) then
    set extra_mounts = ()
endif
#
# By default the server is this very machine, by its short hostname.
if ( ! $?cluster_server ) then
    set cluster_server = `hostname -s`
endif
#
# Count the characters of virt_node, which gives the
# _ number of digits in the node numbers.
set ndig = `echo -n $virt_node | wc -c`
#
# Build the regular expression for the node numbers, with one digit
# _ class per digit; there is always at least one class, and the
# _ remaining ones are added while counting down to zero.
set node_digs = "[0-9]"
@ left = $ndig - 1
while ( $left > 0 )
    set node_digs = "[0-9]${node_digs}"
    @ left--
end
#
# Split the extra NFS mounts in two lists: the local mounts, which can
# _ be made here by means of bind mounts, and the remote mounts, which
# _ must be made by NFS within the chroot cage of the node.
set local_mounts = ()
set remote_mounts = ()
#
# The split is only needed when bind mounts are going to be used;
# _ otherwise both lists are simply left empty.
if ( $bindmnt ) then
    foreach extra_mount ( $extra_mounts )
	#
	# A mount without the appropriate symlink within the cluster
	# _ root is a remote one; a mount with it is a local one.
	if ( ! -l $cluster_root/$extra_mount ) then
	    set remote_mounts = ( $remote_mounts $extra_mount )
	else
	    set local_mounts = ( $local_mounts $extra_mount )
	endif
    end
endif
#
# Define a separator line.
set sep = "--------------------------------------------------------------------------------"
#
# Define the location of the library.
set libdir = /usr/lib/cluster
#
# Define the location of the library within the nodes.
set cldir = /lib/cluster
#
# Define a target for grep -E, as a set of alternatives matching the lines
# _ of /proc/mounts which correspond to mounts within the node trees; all
# _ the alternatives are always included, so that mounts left over by
# _ either type of run, bind or NFS, are detected.
#
# Start with one for the system mounts made by loopback NFS; do not
# _ use the hostname of the server here, in private networks the
# _ hostname associated to the other interface may be shown instead.
set etarg = ":${cluster_root}/[^ ]* $cluster_root/$node_digs/[^ ]* nfs"
#
# This is for the case of the use of local bind mounts; note that this
# _ assumes that ext2, ext3 or ext4 filesystems are being used.
set etarg = "$etarg|^/dev/[^ ]* $cluster_root/$node_digs/[^ ]* ext"
#
# The proc filesystem mount must be treated separately, since it is
# _ of a different type and will not be picked by the regexps above.
set etarg = "$etarg|^proc $cluster_root/$node_digs/proc proc"
#
# The sys filesystem mount must be treated separately, since it is of
# _ a different type and will not be picked by the regexps above.
set etarg = "$etarg|^sysfs $cluster_root/$node_digs/sys sysfs"
#
# The pts filesystem mount must be treated separately, since it is of
# _ a different type and will not be picked by the regexps above.
set etarg = "$etarg|^devpts $cluster_root/$node_digs/dev/pts devpts"
#
# Add one target for each extra NFS mount.
foreach extra_mount ( $extra_mounts )
    set etarg = "$etarg| $cluster_root/$node_digs/$extra_mount nfs "
end
#
# Check for spurious mounts before starting; if there are
# _ any, then list them and refuse to go on.
grep -E -q "$etarg" /proc/mounts
if ( $status == 0 ) then
    echo "${name}: ERROR: there are spurious chroot mounts:"
    grep -E "$etarg" /proc/mounts
    exit 1
endif
#
# Move into the root of the cluster; the relative
# _ paths used below are resolved from there.
cd $cluster_root
#
# The list of the masking mounts to be made:
# _ /var/run: to avoid daemons dying off on the server;
# _ /var/yp/binding: to avoid spurious errors with NIS.
set mask_mounts = ( var/run var/yp/binding )
#
# Each masking mount needs a masking directory within the cluster root,
# _ first the one for the /var/run directory of the nodes and then the
# _ one for their /var/yp/binding directory; verify whether or not each
# _ one is there and, if not, make one.
foreach fakedir ( fake-var-run fake-var-yp-binding )
    #
    # Nothing to do if the directory is already in place.
    if ( -d $fakedir ) continue
    echo "${name}: WARNING: missing $cwd/${fakedir} directory, making one"
    #
    # Anything else which is using the name is moved out of the way.
    if ( -e $fakedir ) then
	mv -f $fakedir ${fakedir}.WRONG
    endif
    #
    # Make the directory and link the explanatory file into it.
    mkdir $fakedir
    ln -s $libdir/${fakedir}.README $fakedir/README
end
#
# Loop over the nodes; the pattern is left unquoted so that the shell
# _ expands it as a filename pattern in the current directory, which
# _ presumably yields the names of the node directories.
foreach node ( $node_digs )
    #
    # Print out a progress report separator and the name of the node.
    echo $sep
    echo current node is: $nick_name$node
    #
    # If bind-mounts is in effect, then mount all local filesystems
    # _ here, rather than within the anchoring program.
    if ( $bindmnt ) then
	#
	# Preparation: mount filesystems for the node.
	echo -n "  Mounting filesystems for the node:\n "
	#
	# Loop over the filesystems to be mounted; note the inclusion
	# _ of the /proc, /sys and /dev/pts filesystems.
	foreach fs ( proc sys dev/pts $mount_points $mask_mounts $local_mounts )
	    #
	    # Write out some progress report.
	    echo -n " /$fs"
	    #
	    # Start an error counter for this filesystem.
	    @ ec = 0
	    #
	    # An error-handling label: the mount is retried from here.
	    again1:
	    #
	    # Mount the filesystems, taking care of each type in turn; in all
	    # _ cases whatever the mount command prints, including its error
	    # _ output, is captured in the error variable, with any
	    # _ parentheses replaced by underscores.
	    #
	    # Take care of the /proc, /sys and /dev/pts filesystems: it is faster
	    # _ to make bind mounts, rather than fresh proc, sys or pts mounts.
	    if ( $fs == proc || $fs == sys || $fs == dev/pts ) then
		set error = `mount -n --bind /$fs $cluster_root/$node/$fs |& \
				sed -e 's|[()]|_|g' | cat`
	    #
	    # Take care of the system mounts, that is, of the filesystems listed
	    # _ in mount_points; the source is the directory of this node within
	    # _ the directory of the filesystem under the cluster root.
	    else if ( $fs == `echo "$mount_points" | tr ' ' '\n' | grep "^$fs"'$'` ) then
		set error = `mount -n --bind $cluster_root/$fs/$node $cluster_root/$node/$fs |& \
				sed -e 's|[()]|_|g' | cat`
	    #
	    # Take care of the masking mounts: the fake directory, named after
	    # _ the filesystem with slashes turned into dashes, is mounted over
	    # _ the directory of the node, but only if that directory exists.
	    # _ NOTE(review): when it does not exist the error variable is not
	    # _ assigned here, so that the test below sees its previous value.
	    else if ( $fs == `echo "$mask_mounts" | tr ' ' '\n' | grep "^$fs"'$'` ) then
		echo -n "(mask)"
		if ( -d $cluster_root/$node/$fs ) then
		    set fn = `echo -n $fs | tr '/' '-'`
		    set error = `mount -n --bind $cluster_root/fake-$fn $cluster_root/$node/$fs |& \
				    sed -e 's|[()]|_|g' | cat`
		endif
	    #
	    # Take care of the local extra mounts, which are the only ones left.
	    else
		set error = `mount -n --bind $cluster_root/$fs $cluster_root/$node/$fs |& \
				sed -e 's|[()]|_|g' | cat`
	    endif
	    #
	    # Handle mount errors: any output at all from the mount
	    # _ command is taken to mean that the mount failed.
	    if ( "$error" != "" ) then
		if ( $ec == 0 ) echo ""
		echo -n "  WARNING: cannot mount filesystem $cluster_root/$node/${fs}: "
		#
		# Increment the error counter.
		@ ec = $ec + 1
		#
		# Try again up to mount_retries times at retry_timeout-second
		# _ intervals; after that give up on the node and go on to
		# _ unmount whatever has been mounted for it.
		if ( $ec <= $mount_retries ) then
		    echo "  trying again in $retry_timeout seconds..."
		    echo "  (error message was: $error)"
		    sleep $retry_timeout
		    goto again1
		else
		    echo "  WARNING: failed $mount_retries times, quitting..."
		    goto cleanexit
		endif
	    endif
	end
	#
	# End the progress-report line.
	echo ""
    endif
    #
    # Do a chroot to the node and execute the anchor script, passing
    # _ all the necessary command-line arguments; note the inclusion
    # _ of the /proc, /sys and /dev/pts filesystems in the case when
    # _ bind-mounts is not in effect. The third and fourth arguments
    # _ are presumably the lists of filesystems and of masking mounts
    # _ which the anchor script has to mount by itself; with bind-mounts
    # _ in effect only the remote extra mounts are passed, with an
    # _ empty list of masking mounts.
    if ( $bindmnt ) then
	chroot $cluster_root/$node $cldir/multi-apt-get-chroot.anchor \
	    $cluster_server $cluster_root "$remote_mounts" "" \
	    $mount_retries $retry_timeout $agoparg
    else
	chroot $cluster_root/$node $cldir/multi-apt-get-chroot.anchor \
	    $cluster_server $cluster_root "proc sys dev/pts $mount_points $extra_mounts" "$mask_mounts" \
	    $mount_retries $retry_timeout $agoparg
    endif
    #
    # The clean-exit label: control comes here when the script is
    # _ interrupted and also when a mount fails for good.
    cleanexit:
    #
    # If bind-mounts is in effect, then unmount here all the
    # _ local filesystems which were mounted before.
    if ( $bindmnt ) then
	#
	# Finalization: unmount filesystems for the node.
	echo -n "  Unmounting filesystems for the node:\n "
	#
	# Loop over the mounted filesystems, taking the
	# _ groups in the reverse of the mounting order.
	foreach fs ( $local_mounts $mask_mounts $mount_points dev/pts sys proc )
	    #
	    # Write out some progress report.
	    echo -n " /$fs"
	    #
	    # Start an error counter for this filesystem.
	    @ ec = 0
	    #
	    # An error-handling label: the unmount is retried from here.
	    again2:
	    #
	    # Try a straight unmount, capturing whatever it prints, with
	    # _ any parentheses replaced by underscores.
	    set error = `umount -n $cluster_root/$node/$fs |& sed -e 's|[()]|_|g' | cat`
	    #
	    # Handle unmount errors; note the _reversed_ grep searches in order to
	    # _ filter out some error messages which require no further action,
	    # _ namely those telling that the mount point was not found or was
	    # _ not mounted; these are matched by their English text.
	    if ( "$error" != "" && \
		"`echo $error | grep -v 'umount: $cluster_root/$node/${fs}: not found'`" != "" && \
		"`echo $error | grep -v 'umount: $cluster_root/$node/${fs}: not mounted'`" != "" ) then
		if ( $ec == 0 ) echo ""
		echo -n "  WARNING: cannot unmount filesystem $cluster_root/$node/${fs}: "
		#
		# Increment the error counter.
		@ ec = $ec + 1
		#
		# Try again up to mount_retries times at retry_timeout-second intervals.
		if ( $ec <= $mount_retries ) then
		    echo "  trying again in $retry_timeout seconds..."
		    echo "  (error message was: $error)"
		    sleep $retry_timeout
		    goto again2
		else
		    #
		    # If waiting does not work, do a lazy unmount
		    # _ and go on to the next filesystem.
		    echo "  failed $mount_retries times, giving up."
		    echo "  WARNING: doing a lazy unmount for $cluster_root/$node/$fs..."
		    umount -nl $cluster_root/$node/$fs
		endif
	    endif
	end
	#
	# End the progress-report line.
	echo ""
    endif
    #
    # Check whether there are any stray pid files left in the masking
    # _ directory for /var/run, printing a warning for each one found.
    find $cluster_root/fake-var-run/ -name \*.pid -exec \
	echo "${name}: WARNING: stray PID file found:" \{\} \;
end
#
# Print a final separator.
echo $sep
#
# Go back to the original directory.
cd -
#
# Check and warn about spurious mounts at the end; this time
# _ they are only listed, without affecting the exit status.
grep -E -q "$etarg" /proc/mounts
if ( $status == 0 ) then
    echo "${name}: WARNING: there are chroot mounts left over:"
    grep -E "$etarg" /proc/mounts
endif
#
# If the pause flag is up, pause before exiting.
if ( $pseflag ) then
    #
    # If there is no pause time, wait for a line to be entered
    # _ on the standard input; else wait for the given time.
    if ( "$psetime" == 0 ) then
	echo -n "Hit [Enter] to exit: "
	set iwait = $<
    else
	sleep $psetime
    endif
endif
