1
1

Merge pull request #4382 from rhc54/topic/scaling

Update the scaling script to avoid use of "system" command
Этот коммит содержится в:
Ralph Castain 2017-10-23 22:43:25 -05:00 коммит произвёл GitHub
родитель ac348da13a 0353be9704
Коммит 70c455938b
3 изменённых файла: 48 добавлений и 43 удаления

Просмотреть файл

@@ -24,7 +24,7 @@ my $ppn = 1;
my @csvrow; my @csvrow;
my @tests = qw(/bin/true ./orte_no_op ./mpi_no_op ./mpi_no_op ./mpi_no_op); my @tests = qw(/bin/true ./orte_no_op ./mpi_no_op ./mpi_no_op ./mpi_no_op);
my @options = ("", "", "", "-mca mpi_add_procs_cutoff 0 -mca pmix_base_async_modex 1", "-mca mpi_add_procs_cutoff 0 -mca pmix_base_async_modex 1 -mca async_mpi_init 1 -mca async_mpi_finalize 1"); my @options = ("", "", "", "-mca mpi_add_procs_cutoff 0 -mca pmix_base_async_modex 1 -mca pmix_base_collect_data 0", "-mca mpi_add_procs_cutoff 0 -mca pmix_base_async_modex 1 -mca async_mpi_init 1 -mca async_mpi_finalize 1 -mca pmix_base_collect_data 0");
my @starterlist = qw(mpirun prun srun aprun); my @starterlist = qw(mpirun prun srun aprun);
my @starteroptionlist = (" --novm", my @starteroptionlist = (" --novm",
" --system-server-only", " --system-server-only",
@@ -87,6 +87,7 @@ my $option;
my $havedvm = 0; my $havedvm = 0;
my @starters; my @starters;
my @starteroptions; my @starteroptions;
my $pid;
# if they explicitly requested specific starters, then # if they explicitly requested specific starters, then
# only use those # only use those
@@ -267,12 +268,17 @@ foreach $starter (@starters) {
# if we are going to use the dvm, then we # if we are going to use the dvm, then we
if ($starter eq "prun") { if ($starter eq "prun") {
# need to start it # need to start it
$cmd = "orte-dvm --system_server 2>&1 &";
if ($myresults) { if ($myresults) {
print FILE "\n\n$cmd\n"; print FILE "\n\norte-dvm --system-server\n";
} }
if (!$SHOWME) { if (!$SHOWME) {
system($cmd); unless ($pid = fork) {
unless (fork) {
exec "orte-dvm --system-server 2>&1";
die "no exec";
}
exit 0;
}
$havedvm = 1; $havedvm = 1;
} }
# give it a couple of seconds to start # give it a couple of seconds to start
@@ -297,7 +303,7 @@ foreach $starter (@starters) {
# pre-position the executable # pre-position the executable
$cmd = $starter . $starteroptions[$index] . " $test 2>&1"; $cmd = $starter . $starteroptions[$index] . " $test 2>&1";
my $error; my $error;
$error = system($cmd); $error = `$cmd`;
if (0 != $error) { if (0 != $error) {
if ($myresults) { if ($myresults) {
print FILE "Command $cmd returned error $error\n"; print FILE "Command $cmd returned error $error\n";
@@ -342,7 +348,8 @@ foreach $starter (@starters) {
if ($havedvm) { if ($havedvm) {
if (!$SHOWME) { if (!$SHOWME) {
$cmd = "prun --system-server-only --terminate"; $cmd = "prun --system-server-only --terminate";
system($cmd); my $rc = `$cmd`;
waitpid($pid, 0);
} }
$havedvm = 0; $havedvm = 0;
} }

Просмотреть файл

@@ -59,7 +59,7 @@
#include "opal/mca/rcache/rcache.h" #include "opal/mca/rcache/rcache.h"
#include "opal/mca/mpool/base/base.h" #include "opal/mca/mpool/base/base.h"
#include "opal/mca/btl/base/base.h" #include "opal/mca/btl/base/base.h"
#include "opal/mca/pmix/pmix.h" #include "opal/mca/pmix/base/base.h"
#include "opal/util/timings.h" #include "opal/util/timings.h"
#include "opal/util/opal_environ.h" #include "opal/util/opal_environ.h"
@@ -366,8 +366,8 @@ static int ompi_register_mca_variables(void)
static void fence_release(int status, void *cbdata) static void fence_release(int status, void *cbdata)
{ {
volatile bool *active = (volatile bool*)cbdata; opal_pmix_lock_t *lock = (opal_pmix_lock_t*)cbdata;
*active = false; OPAL_PMIX_WAKEUP_THREAD(lock);
} }
int ompi_mpi_init(int argc, char **argv, int requested, int *provided) int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
@@ -377,9 +377,10 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
size_t nprocs; size_t nprocs;
char *error = NULL; char *error = NULL;
ompi_errhandler_errtrk_t errtrk; ompi_errhandler_errtrk_t errtrk;
volatile bool active;
opal_list_t info; opal_list_t info;
opal_value_t *kv; opal_value_t *kv;
opal_pmix_lock_t lock;
bool background_fence = false;
OMPI_TIMING_INIT(32); OMPI_TIMING_INIT(32);
@@ -682,24 +683,20 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
if (opal_pmix_base_async_modex && opal_pmix_collect_all_data) { if (opal_pmix_base_async_modex && opal_pmix_collect_all_data) {
/* execute the fence_nb in the background to collect /* execute the fence_nb in the background to collect
* the data */ * the data */
if (!ompi_async_mpi_init) { background_fence = true;
/* we are going to execute a barrier at the OPAL_PMIX_CONSTRUCT_LOCK(&lock);
* end of MPI_Init. We can only have ONE fence opal_pmix.fence_nb(NULL, true, fence_release, (void*)&lock);
* operation with the identical involved procs
* at a time, so we will need to wait when we
* get there */
active = true;
opal_pmix.fence_nb(NULL, true, fence_release, (void*)&active);
} else {
opal_pmix.fence_nb(NULL, true, NULL, NULL);
}
} else if (!opal_pmix_base_async_modex) { } else if (!opal_pmix_base_async_modex) {
active = true; /* we want to do the modex */
OPAL_PMIX_CONSTRUCT_LOCK(&lock);
opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data, opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
fence_release, (void*)&active); fence_release, (void*)&lock);
OMPI_LAZY_WAIT_FOR_COMPLETION(active); /* cannot just wait on thread as we need to call opal_progress */
OMPI_LAZY_WAIT_FOR_COMPLETION(lock.active);
OPAL_PMIX_DESTRUCT_LOCK(&lock);
} }
} else { /* otherwise, we don't want to do the modex, so fall thru */
} else if (!opal_pmix_base_async_modex || opal_pmix_collect_all_data) {
opal_pmix.fence(NULL, opal_pmix_collect_all_data); opal_pmix.fence(NULL, opal_pmix_collect_all_data);
} }
@@ -866,24 +863,24 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
/* Next timing measurement */ /* Next timing measurement */
OMPI_TIMING_NEXT("modex-barrier"); OMPI_TIMING_NEXT("modex-barrier");
/* wait for everyone to reach this point - this is a hard /* if we executed the above fence in the background, then
* barrier requirement at this time, though we hope to relax * we have to wait here for it to complete. However, there
* it at a later point */ * is no reason to do two barriers! */
if (!ompi_async_mpi_init) { if (background_fence) {
/* if we executed the above fence in the background, then OMPI_LAZY_WAIT_FOR_COMPLETION(lock.active);
* we have to wait here for it to complete. However, there OPAL_PMIX_DESTRUCT_LOCK(&lock);
* is no reason to do two barriers! */ } else if (!ompi_async_mpi_init) {
if (opal_pmix_base_async_modex && opal_pmix_collect_all_data) { /* wait for everyone to reach this point - this is a hard
OMPI_LAZY_WAIT_FOR_COMPLETION(active); * barrier requirement at this time, though we hope to relax
* it at a later point */
if (NULL != opal_pmix.fence_nb) {
OPAL_PMIX_CONSTRUCT_LOCK(&lock);
opal_pmix.fence_nb(NULL, false,
fence_release, (void*)&lock);
OMPI_LAZY_WAIT_FOR_COMPLETION(lock.active);
OPAL_PMIX_DESTRUCT_LOCK(&lock);
} else { } else {
active = true; opal_pmix.fence(NULL, false);
if (NULL != opal_pmix.fence_nb) {
opal_pmix.fence_nb(NULL, false,
fence_release, (void*)&active);
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
} else {
opal_pmix.fence(NULL, false);
}
} }
} }

Просмотреть файл

@@ -781,7 +781,8 @@ static int create_app(int argc, char* argv[],
/* Grab all MCA environment variables */ /* Grab all MCA environment variables */
app->env = opal_argv_copy(*app_env); app->env = opal_argv_copy(*app_env);
for (i=0; NULL != environ[i]; i++) { for (i=0; NULL != environ[i]; i++) {
if (0 == strncmp("PMIX_", environ[i], 5)) { if (0 == strncmp("PMIX_", environ[i], 5) ||
0 == strncmp("OMPI_", environ[i], 5)) {
/* check for duplicate in app->env - this /* check for duplicate in app->env - this
* would have been placed there by the * would have been placed there by the
* cmd line processor. By convention, we * cmd line processor. By convention, we