
onto the backend daemons. By default, let mpirun only pack the app_context info and send that to the backend daemons where the mapping will be done. This significantly reduces the computational time on mpirun as it isn't running up/down the topology tree computing thousands of binding locations, and it reduces the launch message to a very small number of bytes. When running -novm, fall back to the old way of doing things where mpirun computes the entire map and binding, and then sends the full info to the backend daemon. Add a new cmd line option/mca param --fwd-mpirun-port that allows mpirun to dynamically select a port, but then passes that back to all the other daemons so they will use that port as a static port for their own wireup. In this mode, we no longer "phone home" directly to mpirun, but instead use the static port to wire up at daemon start. We then use the routing tree to roll up the initial launch report, and limit the number of open sockets on mpirun's node. Update ras simulator to track the new nidmap code. Clean up some bugs in the nidmap regex code, and enhance the error message for not enough slots to include the host on which the problem is found. Update gadget platform file. Initialize the range count when starting a new range. Fix the no-np case in managed allocation. Ensure DVM node usage gets cleaned up after each job. Update scaling.pl script to use --fwd-mpirun-port. Pre-connect the daemon to its parent during launch while we are otherwise waiting for the daemon's children to send their "phone home" rollup messages. Signed-off-by: Ralph Castain <rhc@open-mpi.org>
337 строки
11 KiB
Perl
Исполняемый файл
337 строки
11 KiB
Perl
Исполняемый файл
#!/usr/bin/env perl
|
|
#
|
|
# Copyright (c) 2012 Los Alamos National Security, Inc.
|
|
# All rights reserved.
|
|
# Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
|
|
|
|
use strict;
|
|
use Getopt::Long;
|
|
|
|
# globals

# Node count used as the upper bound of the scaling loop; overwritten later
# with the number of lines mpirun's hostname probe returns.
my $num_nodes = 2;
my $my_arg;
# Number of times each timed command is repeated (--reps).
my $reps = 5;
# Starter selection flags (--dvm, --srun, --mpirun, --aprun).  Each one is
# declared exactly once; a repeated "my" would silently mask the first.
my $usedvm = 0;
my $usesrun = 0;
my $usempirun = 0;
my $useaprun = 0;
# Optional extra application appended to the test list (--myapp).
my $myapp;
# True when --all was given.
my $runall = 0;
# True when the raw output of every run should be copied to the results file.
my $rawoutput = 0;
# Name of the CSV results file (--results).
my $myresults = "myresults";
# Procs per node handed to each starter (--ppn).
my $ppn = 1;
# Accumulates one CSV row: the node count followed by the extracted timings.
my @csvrow;

# @tests and @options are parallel lists: $options[i] holds the extra
# command-line arguments used when timing $tests[i].
my @tests = qw(/bin/true ./orte_no_op ./mpi_no_op ./mpi_no_op ./mpi_no_op);
my @options = ("", "", "", "--fwd-mpirun-port -mca mpi_add_procs_cutoff 0 -mca pmix_base_async_modex 1", "--fwd-mpirun-port -mca mpi_add_procs_cutoff 0 -mca pmix_base_async_modex 1 -mca async_mpi_init 1 -mca async_mpi_finalize 1");

# @starterlist and @starteroptionlist are parallel lists as well:
# $starteroptionlist[i] is the base option string for $starterlist[i].
my @starterlist = qw(mpirun orterun srun aprun);
my @starteroptionlist = ("--novm",
                         "--hnp file:dvm_uri",
                         "--distribution=cyclic -N",
                         "-N");

# Set to true if the script should merely print the cmds
# it would run, but don't run them
my $SHOWME = 0;
# Set to true to suppress most informational messages.
my $QUIET = 0;
# Set to true if we just want to see the help message
my $HELP = 0;
|
|
|
|
# Parse the command line into the option variables declared above.
# Any unrecognised or malformed option aborts the script.
GetOptions(
    "help"      => \$HELP,
    "quiet"     => \$QUIET,
    "showme"    => \$SHOWME,
    "reps=s"    => \$reps,
    "dvm"       => \$usedvm,
    "srun"      => \$usesrun,
    "aprun"     => \$useaprun,
    "mpirun"    => \$usempirun,
    "myapp=s"   => \$myapp,
    "all"       => \$runall,
    "results=s" => \$myresults,
    "rawout"    => \$rawoutput,
    "ppn=s"     => \$ppn,
) or die "unable to parse options, stopped";

# --help: print the usage summary and stop without running anything.
# The option names listed here must match the GetOptions table above
# (the aprun entry used to be misspelled, advertising a nonexistent flag).
if ($HELP) {
    print "$0 [options]

--help | -h      This help message
--quiet | -q     Only output critical messages to stdout
--showme         Show the actual commands without executing them
--reps=s         Number of times to run each test (for statistics)
--mpirun         Use mpirun (or its equivalent orterun)
--dvm            Use orte-dvm to execute the test
--srun           Use srun (if available) to execute the test
--aprun          Use aprun (if available) to execute the test
--myapp=s        In addition to the standard tests, run this specific application (including any args)
--all            Use all available start commands [default]
--results=file   File where results are to stored in comma-separated value format
--rawout         Provide raw timing output to the file
--ppn=n          Run n procs/node
";
    exit(0);
}
|
|
|
|
# Working variables shared by the rest of the script; runcmd() and the
# main loop read and write these file-scoped lexicals directly.
my $n = 1;              # node count of the current scaling step
my $cmd;                # command line about to be executed
my $starter;            # launcher currently being processed
my $test;               # test executable currently being processed
my $output;             # captured output of the last command
my @lines;              # $output split into lines
my $line;
my @results;
my $res;
my $idx;
my $option;             # extra arguments for the current test
my $havedvm = 0;        # true while an orte-dvm started by this script is up
my @starters;           # launchers that were requested and found
my @starteroptions;     # option string for each entry of @starters

# if they asked for all, then set all starters to requested.
# The help text documents --all as the default, so the same happens when
# no starter was selected at all; otherwise a bare invocation would stop
# at the "No available starters" check without testing anything.
if ($runall || !($usedvm || $usesrun || $usempirun || $useaprun)) {
    $useaprun = 1;
    $usempirun = 1;
    $usesrun = 1;
    $usedvm = 1;
}
|
|
|
|
# Work out which launchers can be used: a launcher is kept only when it
# was requested on the command line and an executable of that name sits
# in one of the PATH directories.
my @path = split(":", $ENV{PATH});
my $exists = 0;
my $opt;

# For every known launcher: the flag that requests it, and the text placed
# between its base option string and the procs-per-node count.
my %request_for = (
    orterun => [ \$usedvm,    " --npernode " ],
    mpirun  => [ \$usempirun, " --npernode " ],
    aprun   => [ \$useaprun,  " " ],
    srun    => [ \$usesrun,   " " ],
);

$idx = 0;
foreach $starter (@starterlist) {
    # $idx walks @starteroptionlist in step with @starterlist, so it is
    # advanced on every pass, including the ones skipped below
    my $slot = $idx;
    $idx = $idx + 1;

    my $wanted = $request_for{$starter};
    next unless ($wanted && ${ $wanted->[0] });

    # look for an executable with this name anywhere in the PATH
    $exists = grep { -x "$_/$starter" } @path;
    next unless ($exists);

    $opt = $starteroptionlist[$slot] . $wanted->[1] . $ppn;
    push @starters, $starter;
    push @starteroptions, $opt;
}

# bozo check: nothing can be measured without at least one launcher
unless (scalar @starters) {
    print "No available starters\n";
    exit;
}
|
|
|
|
# if they gave us an app, add it to the list of tests
if ($myapp) {
    push @tests, $myapp;
}

if ($myresults) {
    # Open (and truncate) the results file.  The bareword handle FILE is
    # kept because the rest of the script prints to it by that name.
    # "or" is required here: with "||" the die attaches to the filename
    # string, which is always true, so a failed open went unnoticed.
    open(FILE, ">", $myresults)
        or die "results file '$myresults' could not be opened: $!";
}
|
|
|
|
# determine the number of nodes - doesn't
# matter which starter we use.  Every line the probe prints is counted
# as one node.
$cmd = "mpirun --novm --pernode hostname";
$output = `$cmd`;
@lines = split(/\n/, $output);
$num_nodes = scalar @lines;

# get the local date and time
my ($sec,$min,$hour,$day,$month,$yr19,@rest) = localtime(time);

# localtime() reports the month as 0..11 and the year as an offset from
# 1900.  Build the stamp once: incrementing $month inside each print
# statement bumped it twice, so the results file showed the wrong month.
my $datestamp = sprintf("%d-%d-%d %02d:%02d:%02d",
                        $day, $month + 1, $yr19 + 1900, $hour, $min, $sec);

my $pstarts = join(", ", @starters);
# start by printing out the resulting configuration
print "\n--------------------------------------------------\n";
print "\nTest configuration:\n";
print "\tDate:\t" . $datestamp . "\n";
print "\tNum nodes:\t" . $num_nodes . "\n";
print "\tStarters:\t" . $pstarts . "\n";
print "\n--------------------------------------------------\n";

# and tag the output file as well
if ($myresults) {
    print FILE "Test configuration:\n";
    print FILE "Date:\t" . $datestamp . "\n";
    print FILE "Num nodes:\t" . $num_nodes . "\n";
    print FILE "Starters:\t" . $pstarts . "\n";
}

# position of the current starter in @starters / @starteroptions
my $index = 0;
|
|
|
|
# Extract the wall-clock figures from one line of timing output.
#
# A field is of interest when it contains the tag "real" or "elapsed"
# ("real" is checked first, and only one tag is honoured per field):
#   - tag appended to the number ("0:01.50elapsed"): the text before the tag;
#   - tag as the first field of the line ("real 0.25"): the second field;
#   - tag as a later field ("0.25 real"): the field just before it.
# Fields are separated by any whitespace, so tab-separated output such as
# "real<TAB>0.25" is handled as well as space-separated output.
#
# Takes the line as its only argument and returns the list of extracted
# values (empty when the line carries no tag).
sub _extract_times
{
    my ($text) = @_;
    my @found;

    # get rid of any leading whitespace so the first field is a real one
    $text =~ s/^\s+//;
    my @fields = split(/\s+/, $text);

    for my $pos (0 .. $#fields) {
        for my $tag ("real", "elapsed") {
            my $at = index($fields[$pos], $tag);
            next if ($at < 0);
            if ($at > 0) {
                # take the portion of the string up to the tag
                push @found, substr($fields[$pos], 0, $at);
            } elsif ($pos == 0) {
                # it must be in the next location
                push @found, $fields[1];
            } else {
                # it must be in the prior location
                push @found, $fields[$pos - 1];
            }
            last;
        }
    }
    return @found;
}

# Run the command held in $cmd $reps times and record the timings.
#
# Works entirely on file-level state: reads $cmd, $reps, $n, $myresults
# and $rawoutput; appends every extracted timing to @csvrow (which the
# caller has already seeded with the node count).  When a results file is
# in use, the raw output is optionally copied to it, the finished row is
# written as one comma-separated line and @csvrow is emptied.
# Takes no arguments and returns nothing of interest.
sub runcmd
{
    for (1..$reps) {
        $output = `$cmd`;
        if ($myresults && $rawoutput) {
            print FILE $n . " " . $output . "\n";
        }
        @lines = split(/\n/, $output);
        foreach my $text (@lines) {
            push @csvrow, _extract_times($text);
        }
    }
    # we have now completed all the reps, so log the results
    if ($myresults) {
        print FILE join(",", @csvrow) . "\n";
        # clear the output
        @csvrow = ();
    }
    print "\n";
}
|
|
|
|
# Main driver: for every usable starter, time every available test at
# 1, 2, 4, ... up to $num_nodes and record the results.
foreach $starter (@starters) {
    print "STARTER: $starter\n";
    # if we are going to use the dvm, then we
    # need to start it
    if ($starter eq "orterun") {
        # drop any stale rendezvous file so the wait below only succeeds
        # once the new DVM has written its own
        if (-e "dvm_uri") {
            unlink("dvm_uri");
        }
        $cmd = "orte-dvm --report-uri dvm_uri 2>&1 &";
        if ($myresults) {
            print FILE "\n\n$cmd\n";
        }
        if (!$SHOWME) {
            system($cmd);
            # wait for the rendezvous file to appear (no timeout: this
            # loops until the file exists)
            while (! -e "dvm_uri") {
                sleep(1);
            }
            $havedvm = 1;
        }
    }

    if ($myresults) {
        print FILE "$starter $starteroptions[$index]\n\n";
    }
    my $testnum = 0;
    foreach $test (@tests) {
        # extra arguments for this test; undefined for an app added via --myapp
        $option = $options[$testnum];
        if (-e $test) {
            if ($myresults) {
                print FILE "#nodes,$test,$option\n";
            }
            if (!$SHOWME) {
                # pre-position the executable with one untimed run.  The
                # starter and its options need a separating space, as in
                # the timed command below.
                $cmd = $starter . " " . $starteroptions[$index] . " $test 2>&1";
                system($cmd);
            }
            $n = 1;
            while ($n <= $num_nodes) {
                # every CSV row starts with the node count of this step
                push @csvrow,$n;
                $cmd = "time " . $starter . " " . $starteroptions[$index] . " $option $test 2>&1";
                print $cmd . "\n";
                if (!$SHOWME) {
                    runcmd();
                }
                $n = 2 * $n;
            }
            # NOTE(review): after the doubling loop $n is a power of two
            # larger than $num_nodes, so this bitwise test is never true
            # and the extra run never happens.  The parentheses spell out
            # the apparent intent ("!=" binds tighter than "&"); confirm
            # whether a final full-size run was meant here.
            if (0 != ($num_nodes & $n)) {
                $cmd = "time " . $starter . " " . $starteroptions[$index] . " $option $test 2>&1";
                print $cmd . "\n";
                if (!$SHOWME) {
                    runcmd();
                }
            }
            print "\n--------------------------------------------------\n";
        } else {
            print "Test " . $test . " was not found - test skipped\n";
            print "\n--------------------------------------------------\n";
        }
        $testnum = $testnum + 1;
    }
    if ($havedvm) {
        if (!$SHOWME) {
            $cmd = "orterun --hnp file:dvm_uri --terminate";
            system($cmd);
        }
        if (-e "dvm_uri") {
            unlink("dvm_uri");
        }
        # the DVM is gone: without this reset every later starter would
        # issue another terminate request against a DVM that no longer exists
        $havedvm = 0;
    }
    $index = $index + 1;
}

if ($myresults) {
    close(FILE);
}
|