Merge branch 'master' into fix/alpsinfov3
Этот коммит содержится в:
Коммит
3119bc14b2
23
contrib/dist/linux/openmpi.spec
поставляемый
23
contrib/dist/linux/openmpi.spec
поставляемый
@ -12,6 +12,8 @@
|
||||
# Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2013 Mellanox Technologies, Inc.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2015 Research Organization for Information Science
|
||||
# and Technology (RIST). All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -670,7 +672,14 @@ test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT
|
||||
|
||||
%files
|
||||
%defattr(-, root, root, -)
|
||||
%if %(test "%{_prefix}" = "/usr" && echo 1 || echo 0)
|
||||
%{_bindir}/*
|
||||
%{_includedir}/*
|
||||
%{_libdir}/*
|
||||
%{_datadir}
|
||||
%else
|
||||
%{_prefix}
|
||||
%endif
|
||||
# If the sysconfdir is not under the prefix, then list it explicitly.
|
||||
%if !%{sysconfdir_in_prefix}
|
||||
%{_sysconfdir}
|
||||
@ -706,7 +715,13 @@ test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT
|
||||
|
||||
%files runtime -f runtime.files
|
||||
%defattr(-, root, root, -)
|
||||
%dir %{_prefix}
|
||||
%if %(test "%{_prefix}" = "/usr" && echo 1 || echo 0)
|
||||
%{_bindir}/*
|
||||
%{_libdir}/*
|
||||
%{_datadir}
|
||||
%else
|
||||
%{_prefix}
|
||||
%endif
|
||||
# If the sysconfdir is not under the prefix, then list it explicitly.
|
||||
%if !%{sysconfdir_in_prefix}
|
||||
%{_sysconfdir}
|
||||
@ -729,9 +744,6 @@ test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT
|
||||
%{shell_scripts_path}/%{shell_scripts_basename}.sh
|
||||
%{shell_scripts_path}/%{shell_scripts_basename}.csh
|
||||
%endif
|
||||
%dir %{_bindir}
|
||||
%dir %{_libdir}
|
||||
%dir %{_libdir}/openmpi
|
||||
%doc README INSTALL LICENSE
|
||||
%{_pkgdatadir}
|
||||
|
||||
@ -756,6 +768,9 @@ test "x$RPM_BUILD_ROOT" != "x" && rm -rf $RPM_BUILD_ROOT
|
||||
#
|
||||
#############################################################################
|
||||
%changelog
|
||||
* Thu Nov 12 2015 Gilles Gouaillardet <gilles@rist.or.jp>
|
||||
- Revamp packaging when prefix is /usr
|
||||
|
||||
* Tue Jan 20 2015 Bert Wesarg <bert.wesarg@tu-dresden.de>
|
||||
- Remove VampirTrace wrapper from package.
|
||||
|
||||
|
@ -14,14 +14,21 @@ my $reps = 1;
|
||||
# Command-line selectable behaviour (filled in by GetOptions).
my $usedvm = 0;
my $usesrun = 0;
my $usempirun = 0;
my $useaprun = 0;      # declared exactly once; a second "my" would mask this one
my $myapp;
my $runall = 0;
my $rawoutput = 0;
my $myresults;
my @csvrow;            # timing values collected for the current CSV row

# @tests and @options are indexed together: $options[$i] holds the extra
# command-line options used when running $tests[$i].
my @tests = qw(/bin/true ./orte_no_op ./mpi_no_op ./mpi_no_op);
my @options = ("", "", "", "-mca mpi_add_procs_cutoff 0 -mca pmix_base_async_modex 1");

# @starters and @starteroptions are indexed together as well, so both lists
# must have the same number of entries (five here, one per launcher).
my @starters = qw(mpirun orte-submit srun aprun orterun);
my @starteroptions = ("-npernode 1 --novm",
                      "--hnp file:dvm_uri -pernode",
                      "--distribution=cyclic",
                      "-N 1",
                      "-npernode 1 --novm");
|
||||
|
||||
# Set to true if the script should merely print the cmds
|
||||
@ -39,8 +46,12 @@ GetOptions(
|
||||
"reps=s" => \$reps,
|
||||
"dvm" => \$usedvm,
|
||||
"srun" => \$usesrun,
|
||||
"aprun" => \$useaprun,
|
||||
"mpirun" => \$usempirun,
|
||||
"myapp=s" => \$myapp,
|
||||
"all" => \$runall,
|
||||
"results=s" => \$myresults,
|
||||
"rawout" => \$rawoutput,
|
||||
) or die "unable to parse options, stopped";
|
||||
|
||||
if ($HELP) {
|
||||
@ -50,11 +61,15 @@ $0 [options]
|
||||
--help | -h This help message
|
||||
--quiet | -q Only output critical messages to stdout
|
||||
--showme Show the actual commands without executing them
|
||||
--reps Number of times to run each test (for statistics)
|
||||
--reps=s Number of times to run each test (for statistics)
|
||||
--mpirun Use only mpirun (or its equivalent orterun)
|
||||
--dvm Use only orte-dvm to execute the test
|
||||
--srun Use only srun to execute the test
|
||||
--srun Use only srun (if available) to execute the test
|
||||
--aprun Use only aprun (if available) to execute the test
|
||||
--myapp=s In addition to the standard tests, run this specific application (including any args)
|
||||
--all Use all available start commands [default]
|
||||
--results=file File where results are to be stored in comma-separated value format
|
||||
--rawout Provide raw timing output to the file
|
||||
EOT
|
||||
exit(0);
|
||||
}
|
||||
@ -68,7 +83,6 @@ my @lines;
|
||||
my $line;
|
||||
my @results;
|
||||
my $res;
|
||||
my $toggle;
|
||||
my $idx;
|
||||
my $option;
|
||||
my $havedvm = 0;
|
||||
@ -104,6 +118,12 @@ while ($idx <= $#starters) {
|
||||
splice @starteroptions, $idx, 1;
|
||||
# adjust the index
|
||||
$idx = $idx - 1;
|
||||
} elsif ($useaprun && $starter ne "aprun") {
|
||||
# remove this one from the list
|
||||
splice @starters, $idx, 1;
|
||||
splice @starteroptions, $idx, 1;
|
||||
# adjust the index
|
||||
$idx = $idx - 1;
|
||||
} elsif ($usempirun && (($starter ne "mpirun") && ($starter ne "orterun"))) {
|
||||
# remove this one from the list
|
||||
splice @starters, $idx, 1;
|
||||
@ -141,13 +161,142 @@ if (scalar @starters == 0) {
|
||||
exit;
|
||||
}
|
||||
|
||||
# if we are going to use the dvm, then we
|
||||
# need to start it
|
||||
if (-e "dvm_uri") {
|
||||
system("rm -f dvm_uri");
|
||||
# if they gave us an app, add it to the list of tests
|
||||
if ($myapp) {
|
||||
push @tests, $myapp;
|
||||
}
|
||||
|
||||
if ($myresults) {
    # Open the results file for writing.  The low-precedence "or" is
    # required here: with "||" the die() would be attached to the filename
    # string (which is always true) and a failed open would go unnoticed.
    # The three-argument form keeps the mode separate from the filename.
    open(FILE, ">", $myresults)
        or die "file could not be opened: $myresults: $!";
}
|
||||
|
||||
# determine the number of nodes - doesn't
|
||||
# matter which starter we use
|
||||
$cmd = $starters[0] . " " . $starteroptions[0] . " hostname";
|
||||
print "CMD: $cmd\n";
|
||||
$output = `$cmd`;
|
||||
print "$output\n";
|
||||
@lines = split(/\n/, $output);
|
||||
$num_nodes = $#lines + 1;
|
||||
|
||||
# collect the complete list of starters as a comma-separated string;
# join() covers every entry, including the last one
my $mystarters = join(",", @starters);
|
||||
|
||||
# get the local date and time
my ($sec,$min,$hour,$day,$month,$yr19,@rest) = localtime(time);

# localtime() reports the month as 0..11 and the year as an offset from 1900.
# Build the date string once so that the console and the results file show
# the same value (incrementing $month in each print would shift the second
# date by one month).
my $datestamp = "$day-" . ($month + 1) . "-" . ($yr19 + 1900) . " " .
                sprintf("%02d:%02d:%02d", $hour, $min, $sec);

# start by printing out the resulting configuration
print "\n--------------------------------------------------\n";
print "\nTest configuration:\n";
print "\tDate:\t" . $datestamp . "\n";
print "\tNum nodes:\t" . $num_nodes . "\n";
print "\tStarters:\t" . $mystarters . "\n";
print "\n--------------------------------------------------\n";

# and tag the output file as well
if ($myresults) {
    print FILE "Test configuration:\n";
    print FILE "Date:\t" . $datestamp . "\n";
    print FILE "Num nodes:\t" . $num_nodes . "\n";
    print FILE "Starters:\t" . $mystarters . "\n";
}
|
||||
|
||||
my $index = 0;
|
||||
|
||||
# Run the command held in $cmd $reps times and harvest the wall-clock time
# reported for each repetition.
#
# Reads the file-level variables $cmd, $reps, $n, $myresults and $rawoutput.
# Every "real"/"elapsed" value found in the command output is appended to
# @csvrow (the caller has already pushed the process count as the first
# column).  When a results file was requested, the raw output is optionally
# echoed to FILE and the accumulated row is written to FILE as one
# comma-separated line.  @csvrow is always emptied before returning so rows
# never leak from one invocation into the next, even without --results.
# $output is left holding the output of the last repetition.
sub runcmd
{
    for (1..$reps) {
        $output = `$cmd`;
        if ($myresults && $rawoutput) {
            print FILE $n . " " . $output . "\n";
        }
        foreach my $timing_line (split(/\n/, $output)) {
            # only lines that mention the real or elapsed time matter
            next if (index($timing_line, "real") < 0 &&
                     index($timing_line, "elapsed") < 0);
            # drop leading whitespace, then tokenize on any whitespace so
            # that both "real 0.12" and "real<TAB>0m0.12s" are understood
            (my $trimmed = $timing_line) =~ s/^\s+//;
            my @fields = split(/\s+/, $trimmed);
            for my $pos (0 .. $#fields) {
                push @csvrow, _timing_value(\@fields, $pos);
            }
        }
    }
    # we have now completed all the reps, so log the results
    if ($myresults) {
        print FILE join(",", @csvrow) . "\n";
    }
    # clear the row regardless of whether it was written anywhere
    @csvrow = ();
    print "\n";
}

# Return the timing value associated with token $pos of @$fields, or an
# empty list when that token carries neither the "real" nor the "elapsed"
# tag ("real" is checked first).
#
# Some systems print the tag as a separate word next to the number, others
# append the tag directly to the number; both layouts are handled.
sub _timing_value
{
    my ($fields, $pos) = @_;
    my $token = $fields->[$pos];
    foreach my $tag ("real", "elapsed") {
        my $tagloc = index($token, $tag);
        next if ($tagloc < 0);
        # tag appended to the number: keep the part in front of the tag
        return substr($token, 0, $tagloc) if ($tagloc > 0);
        # tag is a word of its own: the number follows it when the tag is
        # the first token on the line, otherwise it precedes it
        my $value = (0 == $pos) ? $fields->[1] : $fields->[$pos - 1];
        # keep the CSV column even when the expected neighbour is missing
        return defined($value) ? $value : "";
    }
    return;
}
|
||||
|
||||
foreach $starter (@starters) {
|
||||
# if we are going to use the dvm, then we
|
||||
if ($starter eq "orte-submit") {
|
||||
# need to start it
|
||||
if (-e "dvm_uri") {
|
||||
system("rm -f dvm_uri");
|
||||
}
|
||||
$cmd = "orte-dvm --report-uri dvm_uri 2>&1 &";
|
||||
print $cmd . "\n";
|
||||
if (!$SHOWME) {
|
||||
@ -159,24 +308,17 @@ foreach $starter (@starters) {
|
||||
$havedvm = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# determine the number of nodes - doesn't
|
||||
# matter which starter we use
|
||||
$cmd = $starters[0] . " " . $starteroptions[0] . " hostname";
|
||||
$output = `$cmd`;
|
||||
@lines = split(/\n/, $output);
|
||||
$num_nodes = $#lines + 1;
|
||||
|
||||
|
||||
print "\n--------------------------------------------------\n";
|
||||
|
||||
my $index = 0;
|
||||
foreach $starter (@starters) {
|
||||
if ($myresults) {
|
||||
print FILE "\n\n$starter\n\n";
|
||||
}
|
||||
my $testnum = 0;
|
||||
foreach $test (@tests) {
|
||||
$option = $options[$testnum];
|
||||
if (-e $test) {
|
||||
if ($myresults) {
|
||||
print FILE "#nodes,$test\n";
|
||||
}
|
||||
if (!$SHOWME) {
|
||||
# pre-position the executable
|
||||
$cmd = $starter . $starteroptions[$index] . " $test 2>&1";
|
||||
@ -184,52 +326,19 @@ foreach $starter (@starters) {
|
||||
}
|
||||
$n = 1;
|
||||
while ($n <= $num_nodes) {
|
||||
$cmd = "time " . $starter . " " . $starteroptions[$index] . " -np $n $option $test 2>&1";
|
||||
push @csvrow,$n;
|
||||
$cmd = "time " . $starter . " " . $starteroptions[$index] . " -n $n $option $test 2>&1";
|
||||
print $cmd . "\n";
|
||||
if (!$SHOWME) {
|
||||
for (1..$reps) {
|
||||
$toggle = 1;
|
||||
$output = `$cmd`;
|
||||
print $output . "\n";
|
||||
@lines = split(/\n/, $output);
|
||||
foreach $line (@lines) {
|
||||
if (0 <= index($line, "user") ||
|
||||
0 <= index($line, "sys") ||
|
||||
0 <= index($line, "real") ||
|
||||
0 <= index($line, "elapsed")) {
|
||||
$idx = 0;
|
||||
@results = split(/\s+/,$line, 4);
|
||||
foreach $res (@results) {
|
||||
if ($idx < 3) {
|
||||
print $res;
|
||||
if (0 == $toggle) {
|
||||
print " ";
|
||||
$toggle = 1;
|
||||
} else {
|
||||
print " ";
|
||||
$toggle = 0;
|
||||
}
|
||||
}
|
||||
$idx = $idx + 1;
|
||||
}
|
||||
print "\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
print "\n";
|
||||
runcmd();
|
||||
}
|
||||
$n = 2 * $n;
|
||||
}
|
||||
if ($n < $num_nodes) {
|
||||
if (0 != $num_nodes & $n) {
|
||||
$cmd = "time " . $starter . " " . $starteroptions[$index] . " $option $test 2>&1";
|
||||
print $cmd . "\n";
|
||||
if (!$SHOWME) {
|
||||
for (1..$reps) {
|
||||
$output = `$cmd`;
|
||||
$output =~ s/(.+)\n.*/$1/;
|
||||
@results = split(/\s+/,$output);
|
||||
print $results[0] . " " . $results[1] . " " . $results[2] . "\n";
|
||||
}
|
||||
runcmd();
|
||||
}
|
||||
}
|
||||
print "\n--------------------------------------------------\n";
|
||||
@ -239,15 +348,19 @@ foreach $starter (@starters) {
|
||||
}
|
||||
$testnum = $testnum + 1;
|
||||
}
|
||||
if ($havedvm) {
|
||||
if (!$SHOWME) {
|
||||
$cmd = "orte-submit --hnp file:dvm_uri --terminate";
|
||||
system($cmd);
|
||||
}
|
||||
if (-e "dvm_uri") {
|
||||
system("rm -f dvm_uri");
|
||||
}
|
||||
}
|
||||
$index = $index + 1;
|
||||
}
|
||||
|
||||
if ($havedvm) {
|
||||
if (!$SHOWME) {
|
||||
$cmd = "orte-submit --hnp file:dvm_uri --terminate";
|
||||
system($cmd);
|
||||
}
|
||||
if (-e "dvm_uri") {
|
||||
system("rm -f dvm_uri");
|
||||
}
|
||||
if ($myresults) {
|
||||
close(FILE);
|
||||
}
|
||||
|
||||
|
@ -386,6 +386,7 @@ int mca_pml_ucx_recv(void *buf, size_t count, ompi_datatype_t *datatype, int src
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
ucp_worker_progress(ompi_pml_ucx.ucp_worker);
|
||||
while (!req->req_complete) {
|
||||
opal_progress();
|
||||
}
|
||||
@ -492,10 +493,11 @@ int mca_pml_ucx_send(const void *buf, size_t count, ompi_datatype_t *datatype, i
|
||||
mca_pml_ucx_get_datatype(datatype),
|
||||
PML_UCX_MAKE_SEND_TAG(tag, comm),
|
||||
mca_pml_ucx_send_completion);
|
||||
if (req == NULL) {
|
||||
if (OPAL_LIKELY(req == NULL)) {
|
||||
return OMPI_SUCCESS;
|
||||
} else if (!UCS_PTR_IS_ERR(req)) {
|
||||
PML_UCX_VERBOSE(8, "got request %p", (void*)req);
|
||||
ucp_worker_progress(ompi_pml_ucx.ucp_worker);
|
||||
ompi_request_wait(&req, MPI_STATUS_IGNORE);
|
||||
return OMPI_SUCCESS;
|
||||
} else {
|
||||
@ -698,6 +700,7 @@ int mca_pml_ucx_start(size_t count, ompi_request_t** requests)
|
||||
PML_UCX_VERBOSE(8, "temporary request %p will complete persistent request %p",
|
||||
(void*)tmp_req, (void*)preq);
|
||||
tmp_req->req_complete_cb_data = preq;
|
||||
preq->tmp_req = tmp_req;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&ompi_request_lock);
|
||||
} else {
|
||||
|
@ -25,6 +25,12 @@ static int mca_pml_ucx_request_free(ompi_request_t **rptr)
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Cancel callback for a UCX-backed request.
 *
 * Hands the request to ucp_request_cancel() on the PML's UCP worker and
 * reports OMPI_SUCCESS unconditionally; the flag argument is not consulted.
 */
static int mca_pml_ucx_request_cancel(ompi_request_t *req, int flag)
{
    (void)flag; /* not used by this implementation */

    ucp_request_cancel(ompi_pml_ucx.ucp_worker, req);

    return OMPI_SUCCESS;
}
|
||||
|
||||
void mca_pml_ucx_send_completion(void *request, ucs_status_t status)
|
||||
{
|
||||
ompi_request_t *req = request;
|
||||
@ -55,12 +61,19 @@ void mca_pml_ucx_recv_completion(void *request, ucs_status_t status,
|
||||
OPAL_THREAD_UNLOCK(&ompi_request_lock);
|
||||
}
|
||||
|
||||
void mca_pml_ucx_persistent_requset_complete(mca_pml_ucx_persistent_request_t *preq,
|
||||
static void mca_pml_ucx_persistent_request_detach(mca_pml_ucx_persistent_request_t *preq,
|
||||
ompi_request_t *tmp_req)
|
||||
{
|
||||
tmp_req->req_complete_cb_data = NULL;
|
||||
preq->tmp_req = NULL;
|
||||
}
|
||||
|
||||
void mca_pml_ucx_persistent_request_complete(mca_pml_ucx_persistent_request_t *preq,
|
||||
ompi_request_t *tmp_req)
|
||||
{
|
||||
preq->ompi.req_status = tmp_req->req_status;
|
||||
ompi_request_complete(&preq->ompi, true);
|
||||
tmp_req->req_complete_cb_data = NULL;
|
||||
mca_pml_ucx_persistent_request_detach(preq, tmp_req);
|
||||
mca_pml_ucx_request_reset(tmp_req);
|
||||
ucp_request_release(tmp_req);
|
||||
}
|
||||
@ -73,7 +86,8 @@ static inline void mca_pml_ucx_preq_completion(ompi_request_t *tmp_req)
|
||||
ompi_request_complete(tmp_req, false);
|
||||
preq = (mca_pml_ucx_persistent_request_t*)tmp_req->req_complete_cb_data;
|
||||
if (preq != NULL) {
|
||||
mca_pml_ucx_persistent_requset_complete(preq, tmp_req);
|
||||
PML_UCX_ASSERT(preq->tmp_req != NULL);
|
||||
mca_pml_ucx_persistent_request_complete(preq, tmp_req);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&ompi_request_lock);
|
||||
}
|
||||
@ -120,7 +134,8 @@ void mca_pml_ucx_request_init(void *request)
|
||||
ompi_request_t* ompi_req = request;
|
||||
OBJ_CONSTRUCT(ompi_req, ompi_request_t);
|
||||
mca_pml_ucx_request_init_common(ompi_req, false, OMPI_REQUEST_ACTIVE,
|
||||
mca_pml_ucx_request_free, NULL);
|
||||
mca_pml_ucx_request_free,
|
||||
mca_pml_ucx_request_cancel);
|
||||
}
|
||||
|
||||
void mca_pml_ucx_request_cleanup(void *request)
|
||||
@ -133,18 +148,35 @@ void mca_pml_ucx_request_cleanup(void *request)
|
||||
|
||||
/*
 * Free callback for persistent requests.
 *
 * Marks the request invalid, unlinks and releases the temporary request if
 * one is still attached, returns the persistent request to the free list
 * exactly once, and resets the caller's handle to MPI_REQUEST_NULL.
 * The request is not touched again after it has been handed back to the
 * free list.
 */
static int mca_pml_ucx_persistent_request_free(ompi_request_t **rptr)
{
    mca_pml_ucx_persistent_request_t* preq = (mca_pml_ucx_persistent_request_t*)*rptr;
    ompi_request_t *tmp_req = preq->tmp_req;

    preq->ompi.req_state = OMPI_REQUEST_INVALID;
    if (tmp_req != NULL) {
        mca_pml_ucx_persistent_request_detach(preq, tmp_req);
        ucp_request_release(tmp_req);
    }
    PML_UCX_FREELIST_RETURN(&ompi_pml_ucx.persistent_reqs, &preq->ompi.super);
    *rptr = MPI_REQUEST_NULL;
    return OMPI_SUCCESS;
}
|
||||
|
||||
/*
 * Cancel callback for persistent requests.
 *
 * A persistent request is cancelled through the temporary request attached
 * to it, if there is one; with no temporary request attached there is
 * nothing to cancel.  OMPI_SUCCESS is returned in both cases and the flag
 * argument is not consulted.
 */
static int mca_pml_ucx_persistent_request_cancel(ompi_request_t *req, int flag)
{
    mca_pml_ucx_persistent_request_t *preq = (mca_pml_ucx_persistent_request_t*)req;
    ompi_request_t *attached = preq->tmp_req;

    (void)flag; /* not used by this implementation */

    if (NULL == attached) {
        return OMPI_SUCCESS;
    }

    ucp_request_cancel(ompi_pml_ucx.ucp_worker, attached);
    return OMPI_SUCCESS;
}
|
||||
|
||||
/*
 * Constructor for persistent request objects.
 *
 * Initializes the embedded OMPI request as a persistent, inactive request
 * with the persistent free and cancel callbacks, and starts out with no
 * temporary request attached.
 */
static void mca_pml_ucx_persisternt_request_construct(mca_pml_ucx_persistent_request_t* req)
{
    mca_pml_ucx_request_init_common(&req->ompi, true, OMPI_REQUEST_INACTIVE,
                                    mca_pml_ucx_persistent_request_free,
                                    mca_pml_ucx_persistent_request_cancel);
    req->tmp_req = NULL;
}
|
||||
|
||||
static void mca_pml_ucx_persisternt_request_destruct(mca_pml_ucx_persistent_request_t* req)
|
||||
|
@ -89,6 +89,7 @@ enum {
|
||||
|
||||
struct pml_ucx_persistent_request {
|
||||
ompi_request_t ompi;
|
||||
ompi_request_t *tmp_req;
|
||||
unsigned flags;
|
||||
void *buffer;
|
||||
size_t count;
|
||||
|
@ -278,6 +278,13 @@ mtu = 2048
|
||||
receive_queues = P,65536,256,192,128
|
||||
max_inline_data = 64
|
||||
|
||||
[Intel HFI1]
|
||||
vendor_id = 0x1175
|
||||
vendor_part_id = 9456,9457
|
||||
use_eager_rdma = 1
|
||||
mtu = 4096
|
||||
max_inline_data = 0
|
||||
|
||||
############################################################################
|
||||
|
||||
# Intel has several OUI's, including 0x8086. Amusing. :-) Intel has
|
||||
|
@ -311,6 +311,11 @@ static int create_dmns(orte_grpcomm_signature_t *sig,
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
/* get the array */
|
||||
if (0 == jdata->map->num_nodes) {
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
dns = (orte_vpid_t*)malloc(jdata->map->num_nodes * sizeof(vpid));
|
||||
nds = 0;
|
||||
for (i=0; i < jdata->map->nodes->size && (int)nds < jdata->map->num_nodes; i++) {
|
||||
|
@ -1652,6 +1652,7 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
|
||||
"%s plm:base:setup_vm only HNP in use",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
OBJ_DESTRUCT(&nodes);
|
||||
map->num_nodes = 1;
|
||||
/* mark that the daemons have reported so we can proceed */
|
||||
daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -61,7 +61,8 @@ typedef uint32_t orte_proc_type_t;
|
||||
#define ORTE_PROC_DVM 0x0102 // DVM + daemon
|
||||
#define ORTE_PROC_IOF_ENDPT 0x1000
|
||||
#define ORTE_PROC_SCHEDULER 0x2000
|
||||
#define ORTE_PROC_MASTER 0x4004 // Master + HNP
|
||||
#define ORTE_PROC_MASTER_ACTUAL 0x4000
|
||||
#define ORTE_PROC_MASTER (ORTE_PROC_MASTER_ACTUAL + ORTE_PROC_HNP)
|
||||
|
||||
#define ORTE_PROC_IS_SINGLETON (ORTE_PROC_SINGLETON & orte_process_info.proc_type)
|
||||
#define ORTE_PROC_IS_DAEMON (ORTE_PROC_DAEMON & orte_process_info.proc_type)
|
||||
@ -75,7 +76,7 @@ typedef uint32_t orte_proc_type_t;
|
||||
#define ORTE_PROC_IS_DVM (ORTE_PROC_DVM & orte_process_info.proc_type)
|
||||
#define ORTE_PROC_IS_IOF_ENDPT (ORTE_PROC_IOF_ENDPT & orte_process_info.proc_type)
|
||||
#define ORTE_PROC_IS_SCHEDULER (ORTE_PROC_SCHEDULER & orte_process_info.proc_type)
|
||||
#define ORTE_PROC_IS_MASTER (0x4000 & orte_process_info.proc_type)
|
||||
#define ORTE_PROC_IS_MASTER (ORTE_PROC_MASTER_ACTUAL & orte_process_info.proc_type)
|
||||
|
||||
|
||||
/**
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
#include "oshmem/util/oshmem_util.h"
|
||||
#include "oshmem/mca/atomic/atomic.h"
|
||||
#include "oshmem/mca/atomic/base/base.h"
|
||||
|
||||
@ -73,6 +74,8 @@ static int mca_atomic_base_close(void)
|
||||
|
||||
static int mca_atomic_base_open(mca_base_open_flag_t flags)
|
||||
{
|
||||
oshmem_framework_open_output(&oshmem_atomic_base_framework);
|
||||
|
||||
/* Open up all available components */
|
||||
if (OPAL_SUCCESS !=
|
||||
mca_base_framework_components_open(&oshmem_atomic_base_framework, flags)) {
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
#include "oshmem/util/oshmem_util.h"
|
||||
#include "oshmem/mca/memheap/memheap.h"
|
||||
#include "oshmem/mca/memheap/base/base.h"
|
||||
|
||||
@ -41,7 +42,6 @@ mca_memheap_map_t mca_memheap_base_map = {{{0}}};
|
||||
|
||||
static int mca_memheap_base_register(mca_base_register_flag_t flags)
|
||||
{
|
||||
|
||||
(void) mca_base_var_register("oshmem",
|
||||
"memheap",
|
||||
"base",
|
||||
@ -124,6 +124,8 @@ static int mca_memheap_base_open(mca_base_open_flag_t flags)
|
||||
mca_memheap_base_map.n_segments = 0;
|
||||
mca_memheap_base_map.num_transports = 0;
|
||||
|
||||
oshmem_framework_open_output(&oshmem_memheap_base_framework);
|
||||
|
||||
/* Open up all available components */
|
||||
if (OPAL_SUCCESS !=
|
||||
mca_base_framework_components_open(&oshmem_memheap_base_framework, flags)) {
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
#include "oshmem/util/oshmem_util.h"
|
||||
#include "oshmem/mca/scoll/scoll.h"
|
||||
#include "oshmem/mca/scoll/base/base.h"
|
||||
|
||||
@ -100,6 +101,8 @@ static int mca_scoll_base_close(void)
|
||||
|
||||
static int mca_scoll_base_open(mca_base_open_flag_t flags)
|
||||
{
|
||||
oshmem_framework_open_output(&oshmem_scoll_base_framework);
|
||||
|
||||
/* Open up all available components */
|
||||
if (OPAL_SUCCESS !=
|
||||
mca_base_framework_components_open(&oshmem_scoll_base_framework, flags)) {
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
#include "oshmem/constants.h"
|
||||
#include "oshmem/util/oshmem_util.h"
|
||||
#include "oshmem/mca/spml/spml.h"
|
||||
#include "oshmem/mca/spml/base/base.h"
|
||||
#include "oshmem/mca/spml/base/spml_base_request.h"
|
||||
@ -108,6 +109,8 @@ static int mca_spml_base_open(mca_base_open_flag_t flags)
|
||||
|
||||
OBJ_CONSTRUCT(&mca_spml_base_spml, opal_pointer_array_t);
|
||||
|
||||
oshmem_framework_open_output(&oshmem_spml_base_framework);
|
||||
|
||||
/* Open up all available components */
|
||||
if (OPAL_SUCCESS !=
|
||||
mca_base_framework_components_open(&oshmem_spml_base_framework, flags)) {
|
||||
|
@ -22,6 +22,7 @@
|
||||
|
||||
#include "oshmem_config.h"
|
||||
#include "opal/datatype/opal_convertor.h"
|
||||
#include "opal/mca/memchecker/base/base.h"
|
||||
#include "orte/include/orte/types.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "oshmem/mca/spml/ikrit/spml_ikrit.h"
|
||||
@ -41,6 +42,19 @@
|
||||
#define SPML_IKRIT_PUT_DEBUG 0
|
||||
#endif
|
||||
|
||||
/*
 * Post an MXM send request (sreq is the request object itself, not a
 * pointer to it) and bail out if posting fails.
 *
 * On failure the error is logged together with the request opcode,
 * oshmem_shmem_abort(-1) is called, and the macro executes
 * "return OSHMEM_ERROR;" in the ENCLOSING function -- so it may only be
 * used inside functions that can return OSHMEM_ERROR.
 *
 * The argument is parenthesized on every use, and the local status
 * variable has a macro-specific name so it cannot shadow a variable of the
 * calling function.
 */
#define SPML_IKRIT_MXM_POST_SEND(sreq)                                      \
do {                                                                        \
    mxm_error_t spml_ikrit_post_err_;                                       \
    spml_ikrit_post_err_ = mxm_req_send(&(sreq));                           \
    if (MXM_OK != spml_ikrit_post_err_) {                                   \
        SPML_ERROR("mxm_req_send (op=%d) failed: %s - aborting",            \
                   (sreq).opcode,                                           \
                   mxm_error_string(spml_ikrit_post_err_));                 \
        oshmem_shmem_abort(-1);                                             \
        return OSHMEM_ERROR;                                                \
    }                                                                       \
} while(0)
|
||||
|
||||
typedef struct spml_ikrit_am_hdr {
|
||||
uint64_t va;
|
||||
} spml_ikrit_am_hdr_t;
|
||||
@ -88,6 +102,7 @@ static inline mxm_mem_key_t *to_mxm_mkey(sshmem_mkey_t *mkey) {
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static inline void mca_spml_irkit_req_wait(mxm_req_base_t *req)
|
||||
{
|
||||
while (!mxm_req_test(req))
|
||||
@ -99,11 +114,12 @@ static int mca_spml_ikrit_put_request_free(struct oshmem_request_t** request)
|
||||
mca_spml_ikrit_put_request_t *put_req =
|
||||
*(mca_spml_ikrit_put_request_t **) request;
|
||||
|
||||
assert(false == put_req->req_put.req_base.req_free_called);
|
||||
OPAL_THREAD_LOCK(&oshmem_request_lock);
|
||||
assert(false == put_req->req_put.req_base.req_free_called);
|
||||
put_req->req_put.req_base.req_free_called = true;
|
||||
opal_free_list_return (&mca_spml_base_put_requests,
|
||||
(opal_free_list_item_t*)put_req);
|
||||
opal_memchecker_base_mem_noaccess(put_req, sizeof(*put_req));
|
||||
OPAL_THREAD_UNLOCK(&oshmem_request_lock);
|
||||
|
||||
*request = SHMEM_REQUEST_NULL; /*MPI_REQUEST_NULL;*/
|
||||
@ -147,11 +163,12 @@ static int mca_spml_ikrit_get_request_free(struct oshmem_request_t** request)
|
||||
mca_spml_ikrit_get_request_t *get_req =
|
||||
*(mca_spml_ikrit_get_request_t **) request;
|
||||
|
||||
assert(false == get_req->req_get.req_base.req_free_called);
|
||||
OPAL_THREAD_LOCK(&oshmem_request_lock);
|
||||
assert(false == get_req->req_get.req_base.req_free_called);
|
||||
get_req->req_get.req_base.req_free_called = true;
|
||||
opal_free_list_return (&mca_spml_base_get_requests,
|
||||
(opal_free_list_item_t*)get_req);
|
||||
opal_memchecker_base_mem_noaccess(get_req, sizeof(*get_req));
|
||||
OPAL_THREAD_UNLOCK(&oshmem_request_lock);
|
||||
|
||||
*request = SHMEM_REQUEST_NULL; /*MPI_REQUEST_NULL;*/
|
||||
@ -167,7 +184,7 @@ static int mca_spml_ikrit_get_request_cancel(struct oshmem_request_t * request,
|
||||
|
||||
static void mca_spml_ikrit_get_request_construct(mca_spml_ikrit_get_request_t* req)
|
||||
{
|
||||
req->req_get.req_base.req_type = MCA_SPML_REQUEST_PUT;
|
||||
req->req_get.req_base.req_type = MCA_SPML_REQUEST_GET;
|
||||
req->req_get.req_base.req_oshmem.req_free = mca_spml_ikrit_get_request_free;
|
||||
req->req_get.req_base.req_oshmem.req_cancel =
|
||||
mca_spml_ikrit_get_request_cancel;
|
||||
@ -245,6 +262,10 @@ static inline mca_spml_ikrit_put_request_t *alloc_put_req(void)
|
||||
item = opal_free_list_wait (&mca_spml_base_put_requests);
|
||||
|
||||
req = (mca_spml_ikrit_put_request_t *) item;
|
||||
opal_memchecker_base_mem_undefined(req, sizeof(*req));
|
||||
opal_memchecker_base_mem_defined(&req->req_put.req_base,
|
||||
sizeof(req->req_put.req_base));
|
||||
|
||||
req->req_put.req_base.req_free_called = false;
|
||||
req->req_put.req_base.req_oshmem.req_complete = false;
|
||||
|
||||
@ -259,6 +280,10 @@ static inline mca_spml_ikrit_get_request_t *alloc_get_req(void)
|
||||
item = opal_free_list_wait (&mca_spml_base_get_requests);
|
||||
|
||||
req = (mca_spml_ikrit_get_request_t *) item;
|
||||
opal_memchecker_base_mem_undefined(req, sizeof(*req));
|
||||
opal_memchecker_base_mem_defined(&req->req_get.req_base,
|
||||
sizeof(req->req_get.req_base));
|
||||
|
||||
req->req_get.req_base.req_free_called = false;
|
||||
req->req_get.req_base.req_oshmem.req_complete = false;
|
||||
|
||||
@ -363,19 +388,15 @@ int mca_spml_ikrit_del_procs(oshmem_proc_t** procs, size_t nprocs)
|
||||
|
||||
for (n = 0; n < nprocs; n++) {
|
||||
i = (my_rank + n) % nprocs;
|
||||
if (mca_spml_ikrit.mxm_peers[i]->mxm_conn) {
|
||||
mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i]->mxm_conn);
|
||||
}
|
||||
if (mca_spml_ikrit.hw_rdma_channel && mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn) {
|
||||
mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i]->mxm_conn);
|
||||
if (mca_spml_ikrit.hw_rdma_channel) {
|
||||
assert(mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn != mca_spml_ikrit.mxm_peers[i]->mxm_conn);
|
||||
mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn);
|
||||
}
|
||||
destroy_ptl_idx(i);
|
||||
if (mca_spml_ikrit.mxm_peers[i]) {
|
||||
OBJ_RELEASE(mca_spml_ikrit.mxm_peers[i]);
|
||||
}
|
||||
OBJ_RELEASE(mca_spml_ikrit.mxm_peers[i]);
|
||||
}
|
||||
if (mca_spml_ikrit.mxm_peers)
|
||||
free(mca_spml_ikrit.mxm_peers);
|
||||
free(mca_spml_ikrit.mxm_peers);
|
||||
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
@ -407,20 +428,18 @@ int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs)
|
||||
}
|
||||
memset(conn_reqs, 0x0, sizeof(mxm_conn_req_t));
|
||||
#endif
|
||||
ep_info = malloc(nprocs * sizeof(spml_ikrit_mxm_ep_conn_info_t));
|
||||
ep_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs);
|
||||
if (NULL == ep_info) {
|
||||
rc = OSHMEM_ERR_OUT_OF_RESOURCE;
|
||||
goto bail;
|
||||
}
|
||||
memset(ep_info, 0x0, sizeof(spml_ikrit_mxm_ep_conn_info_t));
|
||||
|
||||
if (mca_spml_ikrit.hw_rdma_channel) {
|
||||
ep_hw_rdma_info = malloc(nprocs * sizeof(spml_ikrit_mxm_ep_conn_info_t));
|
||||
ep_hw_rdma_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs);
|
||||
if (NULL == ep_hw_rdma_info) {
|
||||
rc = OSHMEM_ERR_OUT_OF_RESOURCE;
|
||||
goto bail;
|
||||
}
|
||||
memset(ep_hw_rdma_info, 0x0, sizeof(spml_ikrit_mxm_ep_conn_info_t));
|
||||
}
|
||||
|
||||
mca_spml_ikrit.mxm_peers = (mxm_peer_t **) malloc(nprocs
|
||||
@ -529,8 +548,10 @@ int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs)
|
||||
/* Save returned connections */
|
||||
for (i = 0; i < nprocs; ++i) {
|
||||
mca_spml_ikrit.mxm_peers[i]->mxm_conn = conn_reqs[i].conn;
|
||||
if (OSHMEM_SUCCESS != create_ptl_idx(i))
|
||||
if (OSHMEM_SUCCESS != create_ptl_idx(i)) {
|
||||
rc = OSHMEM_ERR_CONNECTION_FAILED;
|
||||
goto bail;
|
||||
}
|
||||
|
||||
mxm_conn_ctx_set(conn_reqs[i].conn, mca_spml_ikrit.mxm_peers[i]);
|
||||
}
|
||||
@ -559,7 +580,7 @@ int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs)
|
||||
continue;
|
||||
}
|
||||
if (procs[i] == proc_self)
|
||||
continue;
|
||||
continue;
|
||||
|
||||
/* use zcopy for put/get via sysv shared memory */
|
||||
procs[i]->transport_ids[0] = MXM_PTL_SHM;
|
||||
@ -629,7 +650,7 @@ sshmem_mkey_t *mca_spml_ikrit_register(void* addr,
|
||||
#if MXM_API < MXM_VERSION(2,0)
|
||||
mkeys[i].len = 0;
|
||||
#else
|
||||
if (mca_spml_ikrit.ud_only && !mca_spml_ikrit.hw_rdma_channel) {
|
||||
if (mca_spml_ikrit.ud_only) {
|
||||
mkeys[i].len = 0;
|
||||
break;
|
||||
}
|
||||
@ -848,6 +869,7 @@ static inline int mca_spml_ikrit_get_shm(void *src_addr,
|
||||
int mca_spml_ikrit_get(void *src_addr, size_t size, void *dst_addr, int src)
|
||||
{
|
||||
mxm_send_req_t sreq;
|
||||
mxm_error_t err;
|
||||
|
||||
if (0 >= size) {
|
||||
return OSHMEM_SUCCESS;
|
||||
@ -873,10 +895,9 @@ int mca_spml_ikrit_get(void *src_addr, size_t size, void *dst_addr, int src)
|
||||
#endif
|
||||
sreq.base.completed_cb = NULL;
|
||||
|
||||
mxm_req_send(&sreq);
|
||||
opal_progress();
|
||||
mca_spml_irkit_req_wait(&sreq.base);
|
||||
SPML_IKRIT_MXM_POST_SEND(sreq);
|
||||
|
||||
mca_spml_irkit_req_wait(&sreq.base);
|
||||
if (MXM_OK != sreq.base.error) {
|
||||
SPML_ERROR("get request failed: %s - aborting",
|
||||
mxm_error_string(sreq.base.error));
|
||||
@ -935,14 +956,8 @@ int mca_spml_ikrit_get_async(void *src_addr,
|
||||
get_req->mxm_req.base.context = get_req;
|
||||
OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_gets, 1);
|
||||
|
||||
mxm_req_send(&get_req->mxm_req);
|
||||
SPML_IKRIT_MXM_POST_SEND(get_req->mxm_req);
|
||||
|
||||
if (MXM_OK != get_req->mxm_req.base.error) {
|
||||
SPML_ERROR("get request failed: %s - aborting",
|
||||
mxm_error_string(get_req->mxm_req.base.error));
|
||||
oshmem_shmem_abort(-1);
|
||||
return OSHMEM_ERROR;
|
||||
}
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
@ -989,7 +1004,7 @@ static int mca_spml_ikrit_mxm_fence(int dst)
|
||||
fence_req->mxm_req.base.context = fence_req;
|
||||
OPAL_THREAD_ADD32(&mca_spml_ikrit.n_mxm_fences, 1);
|
||||
|
||||
mxm_req_send(&fence_req->mxm_req);
|
||||
SPML_IKRIT_MXM_POST_SEND(fence_req->mxm_req);
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
@ -1162,15 +1177,8 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr,
|
||||
|
||||
mca_spml_ikrit.mxm_peers[dst]->n_active_puts++;
|
||||
|
||||
mxm_req_send(&put_req->mxm_req);
|
||||
SPML_IKRIT_MXM_POST_SEND(put_req->mxm_req);
|
||||
|
||||
if (MXM_OK != put_req->mxm_req.base.error) {
|
||||
OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, -1);
|
||||
SPML_ERROR("put request %p failed: %s - aborting",
|
||||
(void*)put_req, mxm_error_string(put_req->mxm_req.base.error));
|
||||
oshmem_shmem_abort(-1);
|
||||
return OSHMEM_ERROR;
|
||||
}
|
||||
if (need_progress)
|
||||
mxm_progress(mca_spml_ikrit.mxm_context);
|
||||
|
||||
@ -1269,13 +1277,7 @@ int mca_spml_ikrit_put_simple(void* dst_addr,
|
||||
mca_spml_ikrit.mxm_peers[dst]->need_fence = 1;
|
||||
}
|
||||
|
||||
mxm_req_send(&mxm_req);
|
||||
if (MXM_OK != mxm_req.base.error) {
|
||||
SPML_ERROR("put request failed: %s(%d) - aborting",
|
||||
mxm_error_string(mxm_req.base.error), mxm_req.base.error);
|
||||
oshmem_shmem_abort(-1);
|
||||
return OSHMEM_ERROR;
|
||||
}
|
||||
SPML_IKRIT_MXM_POST_SEND(mxm_req);
|
||||
|
||||
wait.req = &mxm_req.base;
|
||||
wait.state = (mxm_req_state_t)(MXM_REQ_SENT | MXM_REQ_COMPLETED);
|
||||
@ -1432,7 +1434,8 @@ int mca_spml_ikrit_send(void* buf,
|
||||
req.base.data.buffer.length = size == 0 ? sizeof(dummy_buf) : size;
|
||||
req.base.data.buffer.memh = NULL;
|
||||
|
||||
mxm_req_send(&req);
|
||||
SPML_IKRIT_MXM_POST_SEND(req);
|
||||
|
||||
mca_spml_irkit_req_wait(&req.base);
|
||||
if (req.base.error != MXM_OK) {
|
||||
return OSHMEM_ERROR;
|
||||
|
@ -106,11 +106,11 @@ static inline int set_mxm_tls()
|
||||
|
||||
tls = getenv("MXM_TLS");
|
||||
if (NULL == tls) {
|
||||
setenv("MXM_OSHMEM_TLS", mca_spml_ikrit.mxm_tls, 1);
|
||||
return OSHMEM_SUCCESS;
|
||||
opal_setenv("MXM_OSHMEM_TLS", mca_spml_ikrit.mxm_tls, 1, &environ);
|
||||
return check_mxm_tls("MXM_OSHMEM_TLS");
|
||||
}
|
||||
if (OSHMEM_SUCCESS == check_mxm_tls("MXM_TLS")) {
|
||||
setenv("MXM_OSHMEM_TLS", tls, 1);
|
||||
opal_setenv("MXM_OSHMEM_TLS", tls, 1, &environ);
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
return OSHMEM_ERROR;
|
||||
@ -120,12 +120,14 @@ static inline int check_mxm_hw_tls(char *v, char *tls)
|
||||
{
|
||||
if (v && tls) {
|
||||
if ((0 == strcmp(tls, "rc") || 0 == strcmp(tls, "dc"))) {
|
||||
mca_spml_ikrit.ud_only = 0;
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
if (strstr(tls, "ud") &&
|
||||
(NULL == strstr(tls, "rc") && NULL == strstr(tls, "dc") &&
|
||||
NULL == strstr(tls, "shm"))) {
|
||||
mca_spml_ikrit.ud_only = 1;
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
}
|
||||
@ -140,8 +142,10 @@ static inline int set_mxm_hw_rdma_tls()
|
||||
if (!mca_spml_ikrit.hw_rdma_channel) {
|
||||
return check_mxm_hw_tls("MXM_OSHMEM_TLS", getenv("MXM_OSHMEM_TLS"));
|
||||
}
|
||||
setenv("MXM_OSHMEM_HW_RDMA_RC_QP_LIMIT", "-1", 0);
|
||||
setenv("MXM_OSHMEM_HW_RDMA_TLS", "rc", 0);
|
||||
opal_setenv("MXM_OSHMEM_HW_RDMA_RC_QP_LIMIT", "-1", 0, &environ);
|
||||
opal_setenv("MXM_OSHMEM_HW_RDMA_TLS", "rc", 0, &environ);
|
||||
SPML_VERBOSE(5, "Additional communication channel is enabled. Transports are: %s",
|
||||
getenv("MXM_OSHMEM_HW_RDMA_TLS"));
|
||||
|
||||
return check_mxm_hw_tls("MXM_OSHMEM_HW_RDMA_TLS",
|
||||
getenv("MXM_OSHMEM_HW_RDMA_TLS"));
|
||||
@ -295,6 +299,8 @@ static int mca_spml_ikrit_component_open(void)
|
||||
mca_spml_ikrit.ud_only = 1;
|
||||
mca_spml_ikrit.mxm_ctx_opts->ptl_bitmap = (MXM_BIT(MXM_PTL_SELF) | MXM_BIT(MXM_PTL_RDMA));
|
||||
#endif
|
||||
SPML_VERBOSE(5, "UD only mode is %s",
|
||||
mca_spml_ikrit.ud_only ? "enabled" : "disabled");
|
||||
|
||||
err = mxm_init(mca_spml_ikrit.mxm_ctx_opts, &mca_spml_ikrit.mxm_context);
|
||||
if (MXM_OK != err) {
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_var.h"
|
||||
|
||||
#include "oshmem/util/oshmem_util.h"
|
||||
#include "oshmem/mca/sshmem/sshmem.h"
|
||||
#include "oshmem/mca/sshmem/base/base.h"
|
||||
|
||||
@ -86,6 +87,8 @@ mca_sshmem_base_register (mca_base_register_flag_t flags)
|
||||
|
||||
static int mca_sshmem_base_open(mca_base_open_flag_t flags)
|
||||
{
|
||||
oshmem_framework_open_output(&oshmem_sshmem_base_framework);
|
||||
|
||||
/* Open up all available components */
|
||||
if (OPAL_SUCCESS !=
|
||||
mca_base_framework_components_open(&oshmem_sshmem_base_framework, flags)) {
|
||||
|
@ -71,11 +71,11 @@ int oshmem_shmem_abort(int errcode)
|
||||
|
||||
/* Should we print a stack trace? Not aggregated because they
|
||||
might be different on all processes. */
|
||||
if (ompi_mpi_abort_print_stack) {
|
||||
if (oshmem_shmem_abort_print_stack) {
|
||||
char **messages;
|
||||
int len, i;
|
||||
|
||||
if (OSHMEM_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
|
||||
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
|
||||
for (i = 0; i < len; ++i) {
|
||||
fprintf(stderr,
|
||||
"[%s:%d] [%d] func:%s\n",
|
||||
@ -94,6 +94,24 @@ int oshmem_shmem_abort(int errcode)
|
||||
}
|
||||
}
|
||||
|
||||
/* Should we wait for a while before aborting? */
|
||||
if (0 != oshmem_shmem_abort_delay) {
|
||||
if (oshmem_shmem_abort_delay < 0) {
|
||||
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n",
|
||||
host, (int) pid);
|
||||
fflush(stderr);
|
||||
while (1) {
|
||||
sleep(5);
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
|
||||
host, (int) pid, oshmem_shmem_abort_delay);
|
||||
do {
|
||||
sleep(1);
|
||||
} while (--oshmem_shmem_abort_delay > 0);
|
||||
}
|
||||
}
|
||||
|
||||
if (!orte_initialized || !oshmem_shmem_initialized) {
|
||||
if (orte_show_help_is_available()) {
|
||||
/* TODO help message from SHMEM not from MPI is needed*/
|
||||
|
@ -13,12 +13,37 @@
|
||||
#include "oshmem/constants.h"
|
||||
|
||||
|
||||
bool oshmem_shmem_abort_print_stack = false;
|
||||
int oshmem_shmem_abort_delay = 0;
|
||||
int oshmem_shmem_lock_recursive = 0;
|
||||
int oshmem_shmem_api_verbose = 0;
|
||||
int oshmem_preconnect_all = 0;
|
||||
|
||||
int oshmem_shmem_register_params(void)
|
||||
{
|
||||
oshmem_shmem_abort_delay = 0;
|
||||
(void) mca_base_var_register("oshmem",
|
||||
"oshmem",
|
||||
NULL,
|
||||
"abort_delay",
|
||||
"If nonzero, print out an identifying message when abort is invoked (hostname, PID of the process that called abort operation) and delay for that many seconds before exiting (a negative delay value means to never abort). This allows attaching of a debugger before quitting the job.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&oshmem_shmem_abort_delay);
|
||||
|
||||
oshmem_shmem_abort_print_stack = false;
|
||||
(void) mca_base_var_register("oshmem",
|
||||
"oshmem",
|
||||
NULL,
|
||||
"abort_print_stack",
|
||||
"If nonzero, print out a stack trace when abort is invoked",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
|
||||
0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&oshmem_shmem_abort_print_stack);
|
||||
|
||||
(void) mca_base_var_register("oshmem",
|
||||
"oshmem",
|
||||
NULL,
|
||||
|
@ -20,9 +20,20 @@ BEGIN_C_DECLS
|
||||
*/
|
||||
|
||||
/**
|
||||
* Whether an MPI_ABORT should print out a stack trace or not.
|
||||
* Whether an abort should print out a stack trace or not.
|
||||
*/
|
||||
OSHMEM_DECLSPEC extern bool ompi_mpi_abort_print_stack;
|
||||
OSHMEM_DECLSPEC extern bool oshmem_shmem_abort_print_stack;
|
||||
|
||||
/**
|
||||
* Whether abort should print out an identifying message
|
||||
* (e.g., hostname and PID) and loop waiting for a debugger to
|
||||
* attach. The value of the integer is how many seconds to wait:
|
||||
*
|
||||
* 0 = do not print the message and do not loop
|
||||
* negative value = print the message and loop forever
|
||||
* positive value = print the message and delay for that many seconds
|
||||
*/
|
||||
OSHMEM_DECLSPEC extern int oshmem_shmem_abort_delay;
|
||||
|
||||
/**
|
||||
* Whether or not the lock routines are recursive
|
||||
|
@ -13,8 +13,6 @@
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "oshmem/constants.h"
|
||||
#include "oshmem/util/oshmem_util.h"
|
||||
|
||||
@ -45,6 +43,7 @@ void oshmem_output_verbose(int level, int output_id, const char* prefix,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void oshmem_output(int output_id, const char* prefix, const char* file,
|
||||
int line, const char* function, const char* format, ...)
|
||||
{
|
||||
|
@ -13,6 +13,10 @@
|
||||
|
||||
#include "oshmem_config.h"
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_framework.h"
|
||||
|
||||
/*
|
||||
* Environment variables
|
||||
*/
|
||||
@ -32,4 +36,18 @@ void oshmem_output_verbose(int level, int output_id, const char* prefix,
|
||||
void oshmem_output(int output_id, const char* prefix, const char* file,
|
||||
int line, const char* function, const char* format, ...);
|
||||
|
||||
|
||||
/* Force opening output for framework
|
||||
* We would like to display error messages in any case (debug/release mode,
|
||||
* set/unset verbose level)
|
||||
* Setting verbose level is not a way because it enables non error messages
|
||||
*/
|
||||
static inline void oshmem_framework_open_output(struct mca_base_framework_t *framework)
|
||||
{
|
||||
if (-1 == framework->framework_output) {
|
||||
framework->framework_output = opal_output_open(NULL);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#endif /* OSHMEM_UTIL_H */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user