From 6ffb0d05077598c54d947e3bff1c2a13f908fe95 Mon Sep 17 00:00:00 2001
From: Ralph Castain
Date: Sat, 14 Oct 2017 10:16:49 -0700
Subject: [PATCH] Ensure that the pmix server system-level rendezvous file is only output by the HNP as (at least for slurm on cray) a daemon could be colocated with the HNP and overwrite the file. Update the scaling.pl script to only use the system-level rendezvous so it doesn't get rejected by a colocated daemon

Signed-off-by: Ralph Castain
---
 contrib/scaling/scaling.pl    | 6 +++---
 orte/orted/pmix/pmix_server.c | 6 ++++--
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/contrib/scaling/scaling.pl b/contrib/scaling/scaling.pl
index 9d0a26cb3b..710f036a45 100755
--- a/contrib/scaling/scaling.pl
+++ b/contrib/scaling/scaling.pl
@@ -27,7 +27,7 @@ my @tests = qw(/bin/true ./orte_no_op ./mpi_no_op ./mpi_no_op ./mpi_no_op);
 my @options = ("", "", "", "-mca mpi_add_procs_cutoff 0 -mca pmix_base_async_modex 1", "-mca mpi_add_procs_cutoff 0 -mca pmix_base_async_modex 1 -mca async_mpi_init 1 -mca async_mpi_finalize 1");
 my @starterlist = qw(mpirun prun srun aprun);
 my @starteroptionlist = (" --novm",
-                         "",
+                         " --system-server-only",
                          " --distribution=cyclic --ntasks-per-node=",
                          " -N");
 
@@ -267,7 +267,7 @@ foreach $starter (@starters) {
     # if we are going to use the dvm, then we
     if ($starter eq "prun") {
         # need to start it
-        $cmd = "orte-dvm -mca pmix_system_server 1 2>&1 &";
+        $cmd = "orte-dvm --system_server 2>&1 &";
         if ($myresults) {
             print FILE "\n\n$cmd\n";
         }
@@ -341,7 +341,7 @@ foreach $starter (@starters) {
     }
     if ($havedvm) {
         if (!$SHOWME) {
-            $cmd = "prun --terminate";
+            $cmd = "prun --system-server-only --terminate";
             system($cmd);
         }
         $havedvm = 0;
diff --git a/orte/orted/pmix/pmix_server.c b/orte/orted/pmix/pmix_server.c
index 8a7a6480b3..8d370c6aef 100644
--- a/orte/orted/pmix/pmix_server.c
+++ b/orte/orted/pmix/pmix_server.c
@@ -273,8 +273,10 @@ int pmix_server_init(void)
     }
 
     /* if requested, tell the server to drop a system-level
-     * PMIx connection point */
-    if (orte_pmix_server_globals.system_server) {
+     * PMIx connection point - only do this for the HNP as, in
+     * at least one case, a daemon can be colocated with the
+     * HNP and would overwrite the server rendezvous file */
+    if (orte_pmix_server_globals.system_server && ORTE_PROC_IS_HNP) {
         kv = OBJ_NEW(opal_value_t);
         kv->key = strdup(OPAL_PMIX_SERVER_SYSTEM_SUPPORT);
         kv->type = OPAL_BOOL;