#!/usr/bin/env bash
##**************************************************************
##
## Copyright (C) 1990-2017, Condor Team, Computer Sciences Department,
## University of Wisconsin-Madison, WI.
##
## Licensed under the Apache License, Version 2.0 (the "License"); you
## may not use this file except in compliance with the License.  You may
## obtain a copy of the License at
##
##    http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
##**************************************************************

# This is a script to run OpenMPI jobs under the HTCondor parallel universe.
# OpenMPI assumes that a full install is available on all execute nodes.

## sample submit script
#universe = parallel
#executable = openmpiscript
#arguments = actual_mpi_job arg1 arg2 arg3
#getenv = true
#
#should_transfer_files = yes
#transfer_input_files = actual_mpi_job
#when_to_transfer_output = on_exit_or_evict
#
#output = out.$(NODE)
#error  = err.$(NODE)
#log    = log
#
#notification = never
#machine_count = 8
#queue
##

## configuration notes
# $MPDIR points to the location of the OpenMPI install
# You may set it manually (not recommended)
#MPDIR=/usr/lib64/openmpi
# The pool admin may set it via OPENMPI_INSTALL_PATH in the condor_config (recommended)
MPDIR=$(condor_config_val OPENMPI_INSTALL_PATH)

# $EXINT is a comma-delimited list of excluded network interfaces.
# If your mpi jobs are hanging, OpenMPI may be trying to use too many
# network interfaces to communicate between nodes.
# You may set it manually (not recommended)
#EXINT="docker0,virbr0"
# The pool admin may set it via OPENMPI_EXCLUDE_NETWORK_INTERFACES in the condor_config (recommended)
EXINT=$(condor_config_val OPENMPI_EXCLUDE_NETWORK_INTERFACES)

# We recommend that your pool admin use MOUNT_UNDER_SCRATCH = /tmp
# so that OpenMPI caches all data under the user's scratch directory.
# Not having /tmp mounted under scratch can also lead to unlink errors,
# which may hang mpi jobs.
_USE_SCRATCH=$(condor_config_val MOUNT_UNDER_SCRATCH)
# The value is quoted in both tests: the configured setting may contain
# whitespace (for example a list of several directories), and an unquoted
# expansion would then hand "[" several words and make it fail.
if [ -z "$_USE_SCRATCH" ]; then
    echo "WARNING: MOUNT_UNDER_SCRATCH not set in condor_config"
# Stripping the shortest prefix that ends in "/tmp" leaves the value
# unchanged only when "/tmp" does not occur anywhere in it.
elif [ "${_USE_SCRATCH#*/tmp}" = "$_USE_SCRATCH" ]; then
    echo "WARNING: /tmp not included in MOUNT_UNDER_SCRATCH"
fi

# If MPDIR is not set, then use a default value
# (quoted so that an install path containing whitespace is tested as one word)
if [ -z "$MPDIR" ]; then
    echo "WARNING: Using default value for \$MPDIR in openmpiscript"
    MPDIR=/usr/lib64/openmpi
fi
# Put the OpenMPI binaries first on PATH, followed by the current directory.
PATH=$MPDIR/bin:.:$PATH
export PATH

# If EXINT is not set, then use some default values
# (quoted so that a list written with spaces, e.g. "eth1, eth2", is one word)
if [ -z "$EXINT" ]; then
    echo "WARNING: Using default values for \$EXINT in openmpiscript"
    EXINT="docker0,virbr0"
fi

# The condor_ssh and sshd.sh helper scripts reside in $(LIBEXEC)
# The directory is looked up once and both helper paths are derived from it,
# instead of invoking condor_config_val twice for the same value.
_CONDOR_LIBEXEC=$(condor_config_val libexec)
CONDOR_SSH=$_CONDOR_LIBEXEC/condor_ssh
SSHD_SH=$_CONDOR_LIBEXEC/sshd.sh
##

# Set up SSHD on the node
# The helper is sourced (not executed) and receives this node's rank and the
# total node count as its two positional arguments. sshd_cleanup, which this
# script calls later, is not defined in this file and presumably comes from
# the sourced helper.
# All three expansions are quoted so that a libexec path containing
# whitespace is still passed to "." as a single file name.
. "$SSHD_SH" "$_CONDOR_PROCNO" "$_CONDOR_NPROCS"

# Set up mpirun cleanup function
#######################################
# Signal handler that shuts down a running mpirun and the sshd helper files.
# Globals:   _MPIRUN_PID (read) - PID of the background mpirun, 0 when none
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   never returns; always exits the script with status 1
#######################################
_MPIRUN_PID=0
mpirun_cleanup() {
    local attempt=0

    echo "Caught SIGTERM, cleaning up..."

    if [ "$_MPIRUN_PID" -ne 0 ]; then
        # Ask mpirun to terminate
        echo "Sending SIGTERM to mpirun (${_MPIRUN_PID})..."
        kill -s SIGTERM "$_MPIRUN_PID"

        # Allow up to 12 polls, 5 seconds apart (60 seconds in total),
        # for mpirun to go away on its own
        echo "Waiting for mpirun to exit..."
        while [ "$attempt" -lt 12 ]; do
            # Signal 0 only probes the PID; failure means it is no longer running
            kill -0 "$_MPIRUN_PID" 2> /dev/null || break
            sleep 5
            attempt=$((attempt + 1))
        done
    fi

    # Remove the sshd files and the generated host list
    echo "Cleaning up sshd files..."
    sshd_cleanup
    rm -f machines

    echo "Exiting early."
    exit 1
}

# Every node whose rank is not 0 does no MPI work of its own: it blocks in
# "wait" until this shell's background children have finished (presumably the
# sshd started by the sourced helper), then cleans up and exits successfully.
if [ "$_CONDOR_PROCNO" -ne 0 ]; then
    wait
    sshd_cleanup
    exit 0
fi

# Only the head node reaches this point. Install the SIGTERM handler that
# stops mpirun (it also performs sshd_cleanup).
trap mpirun_cleanup SIGTERM

# The first argument names the MPI program to launch; after the shift the
# remaining positional parameters are the arguments for that program.
EXECUTABLE=$1
shift

# The binary is copied but the executable flag may be cleared.
# Quoted so that a program path containing whitespace stays a single operand.
chmod +x "$EXECUTABLE"

# Set the location of the contact file
CONDOR_CONTACT_FILE=$_CONDOR_SCRATCH_DIR/contact
export CONDOR_CONTACT_FILE

# The first field of each contact file line is the node rank.
# The lines are sorted numerically on that field and only the rank is kept,
# so mpirun will use a list of these node ranks,
# and condor_ssh will translate them into a hostname:port.
# The redirect target is quoted: an unquoted path containing whitespace would
# make bash abort the command with "ambiguous redirect".
sort -n -k 1 < "$CONDOR_CONTACT_FILE" | awk '{print $1}' > machines

# Check for which ssh agent to use because one or the other
# have each been deprecated at one OpenMPI version or another.
#
# For each candidate MCA parameter name, ompi_info is asked whether this
# OpenMPI build knows the parameter and whether it is marked deprecated.
# mpirun is launched with the first usable one.
#
# Exit status of the script:
#   255  no usable MCA ssh agent parameter was found
#   else the exit status of mpirun as reported by "wait"
_MCA_FAIL=true
_MPIRUN_EXIT=0
for mca_ssh_agent in orte_rsh_agent plm_rsh_agent
do
    # Query ompi_info once and keep only the lines that mention the parameter
    # name in double quotes.
    _MCA_INFO=$(ompi_info -a | grep -F -- "\"${mca_ssh_agent}\"" 2>/dev/null)

    # Parameter unknown to this OpenMPI build: try the next candidate
    if [ -z "$_MCA_INFO" ]; then continue; fi

    # Parameter known but reported as deprecated: try the next candidate
    case "$_MCA_INFO" in
        *deprecated*) continue ;;
    esac

    _MCA_FAIL=false

    # set MCA values for running on HTCondor
    export OMPI_MCA_plm_rsh_no_tree_spawn="true" # disable ssh tree spawn
    export OMPI_MCA_btl_tcp_if_exclude="lo,$EXINT" # exclude network interfaces

    # optionally set MCA values for increasing mpirun verbosity
    #export OMPI_MCA_plm_base_verbose=30
    #export OMPI_MCA_btl_base_verbose=30

    # run mpirun in the background and wait for it to exit
    # (a background job plus "wait" lets the SIGTERM trap installed earlier in
    # this script run while mpirun is still executing). Every expansion is
    # quoted so that the program path and its arguments reach mpirun exactly
    # as they were given, even when they contain whitespace.
    mpirun -v --prefix "$MPDIR" --mca "$mca_ssh_agent" "$CONDOR_SSH" \
        -n "$_CONDOR_NPROCS" -hostfile machines "$EXECUTABLE" "$@" &
    _MPIRUN_PID=$!
    wait "$_MPIRUN_PID"
    # Remember mpirun's status now; the cleanup commands below overwrite $?.
    _MPIRUN_EXIT=$?
    # mpirun is gone, so a late SIGTERM must not signal this PID any more.
    _MPIRUN_PID=0

    break
done

if $_MCA_FAIL
then
    echo could not find a suitable MCA ssh agent
    exit 255
fi

sshd_cleanup
rm -f machines

# Report the MPI job's own result rather than the status of "rm".
exit "$_MPIRUN_EXIT"
