Skip to content

Commit

Permalink
Merge pull request #623 from radical-cybertools/hotfix/issue572
Browse files Browse the repository at this point in the history
Hotfix/issue572
  • Loading branch information
andre-merzky committed May 18, 2015
2 parents 33e5c8d + fbfde2e commit ffff66d
Show file tree
Hide file tree
Showing 7 changed files with 25,570 additions and 5 deletions.
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@
- For a list of open issues and known problems, see
https://github.com/radical-cybertools/radical.pilot/issues?q=is%3Aissue+is%3Aopen+

0.32 Release 2015-05-18
--------------------------------------------------------------------------------

- Hotfix release for MPIRUN_RSH on Stampede (#572).


0.31 Release 2015-04-30
--------------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.31
0.32
12 changes: 8 additions & 4 deletions src/radical/pilot/agent/radical-pilot-agent-multicore.py
Original file line number Diff line number Diff line change
Expand Up @@ -1756,7 +1756,6 @@ def construct_command(self, task_exec, task_args, task_numcores,
return dplace_command, None



# ==============================================================================
#
class LaunchMethodMPIRUNRSH(LaunchMethod):
Expand All @@ -1767,13 +1766,18 @@ def __init__(self, name, config, logger, scheduler):

LaunchMethod.__init__(self, name, config, logger, scheduler)


# --------------------------------------------------------------------------
#
def _configure(self):
# mpirun_rsh (e.g. on Gordon@ SDSC)
self.launch_command = self._which('mpirun_rsh')

# mpirun_rsh (e.g. on Gordon@SDSC, Stampede@TACC)
if not self._which('mpirun_rsh'):
raise Exception("mpirun_rsh could not be found")

# We don't use the full pathname as the user might load a different
# compiler / MPI library suite from his CU pre_exec that requires
# the launcher from that version, as experienced on stampede in #572.
self.launch_command = 'mpirun_rsh'

# --------------------------------------------------------------------------
#
Expand Down
164 changes: 164 additions & 0 deletions tests/issue_572.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
#!/usr/bin/env python

__copyright__ = "Copyright 2015, http://radical.rutgers.edu"
__license__ = "MIT"

import sys
import radical.pilot as rp
import radical.utils as ru

# NOTE(review): the effect of DebugHelper is not visible in this file --
# presumably it installs debugging hooks while `dh` is alive; confirm against
# radical.utils.
dh = ru.DebugHelper ()

# number of units seen in a final state so far (incremented by unit_state_cb)
CNT = 0
# requested pilot runtime, assigned to pdesc.runtime
# (presumably in minutes -- TODO confirm)
RUNTIME = 10
# NOTE(review): not referenced anywhere in the visible part of this script
SLEEP = 1
# number of cores requested for the pilot (pdesc.cores)
CORES = 16
# number of compute unit descriptions created and submitted
UNITS = 1
# scheduler handed to the UnitManager
SCHED = rp.SCHED_DIRECT_SUBMISSION

# pilot description settings: target resource, project, queue, access schema
RESOURCE = 'xsede.stampede'
PROJECT = 'TG-MCB090174'
QUEUE = 'development'
SCHEMA = 'ssh'


#------------------------------------------------------------------------------
#
def pilot_state_cb(pilot, state):
    """Print a pilot's state change and exit if the pilot failed.

    Registered with the pilot manager as a state callback.

    pilot -- the pilot whose state changed; a falsy value is ignored.
    state -- the new state of the pilot.

    Calls ``sys.exit(1)`` (i.e. raises SystemExit) when `state` equals
    ``rp.FAILED``.
    """

    if not pilot:
        return

    # a single parenthesized argument prints identically with the Python 2
    # print statement and the Python 3 print function
    print("[Callback]: ComputePilot '%s' state: %s." % (pilot.uid, state))

    if state == rp.FAILED:
        sys.exit(1)


#------------------------------------------------------------------------------
#
def unit_state_cb(unit, state):
    """Print a unit's state change, count final states, exit on failure.

    Registered with the unit manager as a unit state callback.

    unit  -- the unit whose state changed; a falsy value is ignored.
    state -- the new state of the unit.

    Increments the module-level counter ``CNT`` whenever `state` is one of
    ``rp.FAILED``, ``rp.DONE`` or ``rp.CANCELED``.  For ``rp.FAILED`` the
    unit's stderr is printed and ``sys.exit(2)`` is called (i.e. SystemExit
    is raised).
    """

    # declared up front, before any other statement of the function
    global CNT

    if not unit:
        return

    # a single parenthesized argument prints identically with the Python 2
    # print statement and the Python 3 print function
    print("[Callback]: unit %s on %s: %s." % (unit.uid, unit.pilot_id, state))

    if state in (rp.FAILED, rp.DONE, rp.CANCELED):
        CNT += 1
        print("[Callback]: # %6d" % CNT)

    if state == rp.FAILED:
        print("stderr: %s" % unit.stderr)
        sys.exit(2)


#------------------------------------------------------------------------------
#
def wait_queue_size_cb(umgr, wait_queue_size):
    """Print the wait queue size reported by the unit manager.

    umgr            -- the reporting unit manager (not used here).
    wait_queue_size -- the reported wait queue size.
    """

    # a single parenthesized argument prints identically with the Python 2
    # print statement and the Python 3 print function
    print("[Callback]: wait_queue_size: %s." % wait_queue_size)


#------------------------------------------------------------------------------
#
if __name__ == "__main__":

    # Reproducer for issue #572: run an MPI unit on Stampede whose pre_exec
    # switches the compiler / MPI module suite, and report which mpirun_rsh
    # the unit environment resolves.

    # we can optionally pass a session name to RP
    if len(sys.argv) > 1:
        session_name = sys.argv[1]
    else:
        session_name = None

    # Create a new session. No need to try/except this: if session creation
    # fails, there is not much we can do anyways...
    session = rp.Session(name=session_name)
    print("session id: %s" % session.uid)

    # all other pilot code is now tried/excepted.  If an exception is caught,
    # we can rely on the session object to exist and be valid, and we can thus
    # tear the whole RP stack down via a 'session.close()' call in the
    # 'finally' clause...
    try:

        pmgr = rp.PilotManager(session=session)
        pmgr.register_callback(pilot_state_cb)

        pdesc = rp.ComputePilotDescription()
        pdesc.resource = RESOURCE
        pdesc.cores = CORES
        pdesc.project = PROJECT
        pdesc.queue = QUEUE
        pdesc.runtime = RUNTIME
        pdesc.cleanup = False
        pdesc.access_schema = SCHEMA

        pilot = pmgr.submit_pilots(pdesc)

        umgr = rp.UnitManager(session=session, scheduler=SCHED)
        umgr.register_callback(unit_state_cb, rp.UNIT_STATE)
        umgr.register_callback(wait_queue_size_cb, rp.WAIT_QUEUE_SIZE)
        umgr.add_pilots(pilot)

        cuds = list()
        for _ in range(UNITS):
            cud = rp.ComputeUnitDescription()
            cud.pre_exec = [
                'module load gromacs',
                'echo 2 | trjconv -f tmp.gro -s tmp.gro -o tmpha.gro',
                'module load -intel +intel/14.0.1.106',
                'export PYTHONPATH=/home1/03036/jp43/.local/lib/python2.7/site-packages',
                'module load python/2.7.6',
                'export PATH=/home1/03036/jp43/.local/bin:$PATH',
                'echo "Using mpirun_rsh: `which mpirun_rsh`"'
            ]
            cud.executable = "/opt/apps/intel14/mvapich2_2_0/python/2.7.6/lib/python2.7/site-packages/mpi4py/bin/python-mpi"
            # NOTE: "-n" and "neighbors.nn" must be separate list elements; a
            # missing comma silently concatenates them to "-nneighbors.nn".
            cud.arguments = ["lsdm.py", "-f", "config.ini", "-c",
                             "tmpha.gro", "-n", "neighbors.nn", "-w", "weight.w"]
            cud.cores = 4
            cud.mpi = True
            cud.input_staging = [
                'issue_572_files/config.ini',
                'issue_572_files/lsdm.py',
                'issue_572_files/tmp.gro'
            ]
            cuds.append(cud)

        units = umgr.submit_units(cuds)

        umgr.wait_units()

        for cu in units:
            print("* Task %s state %s, exit code: %s, started: %s, finished: %s"
                  % (cu.uid, cu.state, cu.exit_code, cu.start_time, cu.stop_time))

    except Exception as e:
        # Something unexpected happened in the pilot code above
        print("caught Exception: %s" % e)
        raise

    except (KeyboardInterrupt, SystemExit) as e:
        # Neither KeyboardInterrupt nor SystemExit derives from Exception, so
        # they are handled separately: a callback may have called sys.exit(),
        # or the run was interrupted.  Report it and fall through to the
        # cleanup below instead of re-raising.
        print("need to exit now: %s" % e)

    finally:
        # always clean up the session, no matter if we caught an exception or
        # not.
        print("closing session")
        session.close()

        # the above is equivalent to
        #
        #   session.close (cleanup=True, terminate=True)
        #
        # it will thus both clean out the session's database record, and kill
        # all remaining pilots (none in our example).


#-------------------------------------------------------------------------------
14 changes: 14 additions & 0 deletions tests/issue_572_files/config.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[LSDMAP]
;metric used to compute the distance matrix (rmsd, cmd, dihedral)
metric=rmsd

[LOCALSCALE]
;status (constant, kneighbor, user)
status=constant

;constant epsilon used in case status is constant
epsilon=0.15

;value of k in case status is kneighbor
k=10

Loading

0 comments on commit ffff66d

Please sign in to comment.