Convert a few opal_output() calls to use orte_show_help() instead, and make some minor cosmetic changes dealing with tab spacing and with enclosing C blocks in {}. There was also a long-standing bug in the PSM MTL when the number of hardware contexts on the adapter was less than the number of cores on a node (by default they are the same, hence no issues were reported). For completeness we take care of this case as well, but it requires us to tell PSM how many local processes are running on a node and the local rank of the process on that node so it can allocate the available hardware contexts appropriately.
This commit was SVN r21745.
Этот коммит содержится в:
родитель
cb653bc4e8
Коммит
af09e7678c
@ -20,6 +20,8 @@ EXTRA_DIST = post_configure.sh
|
||||
|
||||
AM_CPPFLAGS = $(mtl_psm_CPPFLAGS)
|
||||
|
||||
dist_pkgdata_DATA = help-mtl-psm.txt
|
||||
|
||||
mtl_psm_sources = \
|
||||
mtl_psm.c \
|
||||
mtl_psm.h \
|
||||
|
40
ompi/mca/mtl/psm/help-mtl-psm.txt
Обычный файл
40
ompi/mca/mtl/psm/help-mtl-psm.txt
Обычный файл
@ -0,0 +1,40 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (C) 2009. QLogic Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
[psm init]
|
||||
Initialization of PSM library failed.
|
||||
|
||||
Error: %s
|
||||
#
|
||||
[debug level]
|
||||
Unable to set PSM debug level.
|
||||
|
||||
Error: %s
|
||||
#
|
||||
[unable to open endpoint]
|
||||
PSM was unable to open an endpoint. Please make sure that the network link is
|
||||
active on the node and the hardware is functioning.
|
||||
|
||||
Error: %s
|
||||
#
|
||||
[no uuid present]
|
||||
Error obtaining unique transport key from ORTE (orte_precondition_transports %s
|
||||
the environment).
|
||||
|
||||
Local host: %s
|
||||
#
|
||||
[error polling network]
|
||||
Error %s occurred while attempting to make network progress (psm_mq_ipeek).
|
||||
#
|
||||
[error posting receive]
|
||||
Unable to post application receive buffer (psm_mq_irecv).
|
||||
|
||||
Error: %s
|
||||
Buffer: %p
|
||||
Length: %d
|
@ -19,6 +19,8 @@
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "ompi/mca/mtl/mtl.h"
|
||||
#include "ompi/runtime/ompi_module_exchange.h"
|
||||
#include "ompi/mca/mtl/base/mtl_base_datatype.h"
|
||||
@ -62,7 +64,9 @@ ompi_mtl_psm_errhandler(psm_ep_t ep, const psm_error_t error,
|
||||
case PSM_EP_NO_PORTS_AVAIL:
|
||||
case PSM_EP_NO_NETWORK:
|
||||
case PSM_EP_INVALID_UUID_KEY:
|
||||
opal_output(0, "Open MPI failed to open a PSM endpoint: %s\n", error_string);
|
||||
orte_show_help("help-mtl-psm.txt",
|
||||
"unable to open endpoint", true,
|
||||
psm_error_get_string(error));
|
||||
break;
|
||||
|
||||
/* We can't handle any other errors than the ones above */
|
||||
@ -77,7 +81,7 @@ ompi_mtl_psm_errhandler(psm_ep_t ep, const psm_error_t error,
|
||||
|
||||
int ompi_mtl_psm_progress( void );
|
||||
|
||||
int ompi_mtl_psm_module_init() {
|
||||
int ompi_mtl_psm_module_init(int local_rank, int num_local_procs) {
|
||||
psm_error_t err;
|
||||
psm_ep_t ep; /* endpoint handle */
|
||||
psm_mq_t mq;
|
||||
@ -86,6 +90,7 @@ int ompi_mtl_psm_module_init() {
|
||||
struct psm_ep_open_opts ep_opt;
|
||||
unsigned long long *uu = (unsigned long long *) unique_job_key;
|
||||
char *generated_key;
|
||||
char env_string[256];
|
||||
|
||||
generated_key = getenv("OMPI_MCA_orte_precondition_transports");
|
||||
memset(uu, 0, sizeof(psm_uuid_t));
|
||||
@ -93,17 +98,26 @@ int ompi_mtl_psm_module_init() {
|
||||
if (!generated_key || (strlen(generated_key) != 33) ||
|
||||
sscanf(generated_key, "%016llx-%016llx", &uu[0], &uu[1]) != 2)
|
||||
{
|
||||
opal_output(0, "Error obtaining unique transport key from ORTE "
|
||||
"(orte_precondition_transpots %s the environment)\n",
|
||||
generated_key ? "could not be parsed from" :
|
||||
"not present in");
|
||||
return OMPI_ERROR;
|
||||
|
||||
orte_show_help("help-mtl-psm.txt",
|
||||
"no uuid present", true,
|
||||
generated_key ? "could not be parsed from" :
|
||||
"not present in", orte_process_info.nodename);
|
||||
return OMPI_ERROR;
|
||||
|
||||
}
|
||||
|
||||
/* Handle our own errors for opening endpoints */
|
||||
psm_error_register_handler(ompi_mtl_psm.ep, ompi_mtl_psm_errhandler);
|
||||
|
||||
|
||||
/* Setup MPI_LOCALRANKID and MPI_LOCALNRANKS so PSM can allocate hardware
|
||||
* contexts correctly.
|
||||
*/
|
||||
snprintf(env_string, sizeof(env_string), "%d", local_rank);
|
||||
setenv("MPI_LOCALRANKID", env_string, 0);
|
||||
snprintf(env_string, sizeof(env_string), "%d", num_local_procs);
|
||||
setenv("MPI_LOCALNRANKS", env_string, 0);
|
||||
|
||||
/* Setup the endpoint options. */
|
||||
bzero((void*) &ep_opt, sizeof(ep_opt));
|
||||
ep_opt.timeout = ompi_mtl_psm.connect_timeout * 1e9;
|
||||
ep_opt.unit = ompi_mtl_psm.ib_unit;
|
||||
@ -121,9 +135,10 @@ int ompi_mtl_psm_module_init() {
|
||||
/* Open PSM endpoint */
|
||||
err = psm_ep_open(unique_job_key, &ep_opt, &ep, &epid);
|
||||
if (err) {
|
||||
opal_output(0, "Error in psm_ep_open (error %s)\n",
|
||||
psm_error_get_string(err));
|
||||
return OMPI_ERROR;
|
||||
orte_show_help("help-mtl-psm.txt",
|
||||
"unable to open endpoint", true,
|
||||
psm_error_get_string(err));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/* Future errors are handled by the default error handler */
|
||||
@ -135,9 +150,10 @@ int ompi_mtl_psm_module_init() {
|
||||
0,
|
||||
&mq);
|
||||
if (err) {
|
||||
opal_output(0, "Error in psm_mq_init (error %s)\n",
|
||||
psm_error_get_string(err));
|
||||
return OMPI_ERROR;
|
||||
orte_show_help("help-mtl-psm.txt",
|
||||
"psm init", true,
|
||||
psm_error_get_string(err));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
ompi_mtl_psm.ep = ep;
|
||||
@ -151,7 +167,7 @@ int ompi_mtl_psm_module_init() {
|
||||
opal_output(0, "Open MPI couldn't send PSM epid to head node process");
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
|
||||
/* register the psm progress function */
|
||||
opal_progress_register(ompi_mtl_psm_progress);
|
||||
|
||||
@ -228,35 +244,39 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
psm_error_t *errs_out = NULL, err;
|
||||
size_t size;
|
||||
int proc_errors[PSM_ERROR_LAST] = { 0 };
|
||||
int timeout_in_secs;
|
||||
int proc, my_local_rank = -1, num_local_procs = 0, timeout_in_secs;
|
||||
|
||||
assert(mtl == &ompi_mtl_psm.super);
|
||||
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
|
||||
errs_out = (psm_error_t *) malloc(nprocs * sizeof(psm_error_t));
|
||||
if (errs_out == NULL)
|
||||
if (errs_out == NULL) {
|
||||
goto bail;
|
||||
}
|
||||
epids_in = (psm_epid_t *) malloc(nprocs * sizeof(psm_epid_t));
|
||||
if (epids_in == NULL)
|
||||
if (epids_in == NULL) {
|
||||
goto bail;
|
||||
}
|
||||
epaddrs_out = (psm_epaddr_t *) malloc(nprocs * sizeof(psm_epaddr_t));
|
||||
if (epaddrs_out == NULL)
|
||||
if (epaddrs_out == NULL) {
|
||||
goto bail;
|
||||
|
||||
}
|
||||
rc = OMPI_SUCCESS;
|
||||
|
||||
/* Get the epids for all the processes from modex */
|
||||
for (i = 0; i < (int) nprocs; i++) {
|
||||
rc = ompi_modex_recv(&mca_mtl_psm_component.super.mtl_version,
|
||||
procs[i], (void**)&epid, &size);
|
||||
if (rc != OMPI_SUCCESS || size != sizeof(psm_epid_t))
|
||||
return OMPI_ERROR;
|
||||
if (rc != OMPI_SUCCESS || size != sizeof(psm_epid_t)) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
epids_in[i] = *epid;
|
||||
}
|
||||
|
||||
timeout_in_secs = min(180, 0.5 * nprocs);
|
||||
if (ompi_mtl_psm.connect_timeout < timeout_in_secs)
|
||||
if (ompi_mtl_psm.connect_timeout < timeout_in_secs) {
|
||||
timeout_in_secs = ompi_mtl_psm.connect_timeout;
|
||||
}
|
||||
|
||||
psm_error_register_handler(ompi_mtl_psm.ep, PSM_ERRHANDLER_NOP);
|
||||
|
||||
@ -281,8 +301,9 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
opal_output(0, "PSM EP connect error (%s):",
|
||||
errstr ? errstr : "unknown connect error");
|
||||
for (j = 0; j < (int) nprocs; j++) {
|
||||
if (errs_out[j] == thiserr)
|
||||
opal_output(0, " %s", procs[j]->proc_hostname);
|
||||
if (errs_out[j] == thiserr) {
|
||||
opal_output(0, " %s", procs[j]->proc_hostname);
|
||||
}
|
||||
}
|
||||
opal_output(0, "\n");
|
||||
}
|
||||
@ -308,12 +329,16 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
}
|
||||
|
||||
bail:
|
||||
if (epids_in != NULL)
|
||||
if (epids_in != NULL) {
|
||||
free(epids_in);
|
||||
if (errs_out != NULL)
|
||||
}
|
||||
if (errs_out != NULL) {
|
||||
free(errs_out);
|
||||
if (epaddrs_out != NULL)
|
||||
}
|
||||
if (epaddrs_out != NULL) {
|
||||
free(epaddrs_out);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -336,16 +361,18 @@ int ompi_mtl_psm_progress( void ) {
|
||||
|
||||
do {
|
||||
err = psm_mq_ipeek(ompi_mtl_psm.mq, &req, NULL);
|
||||
if (err == PSM_MQ_INCOMPLETE)
|
||||
if (err == PSM_MQ_INCOMPLETE) {
|
||||
return completed;
|
||||
else if (err != PSM_OK)
|
||||
} else if (err != PSM_OK) {
|
||||
goto error;
|
||||
|
||||
}
|
||||
|
||||
completed++;
|
||||
|
||||
err = psm_mq_test(&req, &psm_status);
|
||||
if (err != PSM_OK)
|
||||
if (err != PSM_OK) {
|
||||
goto error;
|
||||
}
|
||||
|
||||
mtl_psm_request = (mca_mtl_psm_request_t*) psm_status.context;
|
||||
|
||||
@ -363,8 +390,9 @@ int ompi_mtl_psm_progress( void ) {
|
||||
}
|
||||
|
||||
if(mtl_psm_request->type == OMPI_MTL_PSM_ISEND) {
|
||||
if (mtl_psm_request->free_after)
|
||||
if (mtl_psm_request->free_after) {
|
||||
free(mtl_psm_request->buf);
|
||||
}
|
||||
}
|
||||
|
||||
switch (psm_status.error_code) {
|
||||
@ -387,8 +415,9 @@ int ompi_mtl_psm_progress( void ) {
|
||||
while (1);
|
||||
|
||||
error:
|
||||
opal_output(0, "Error in psm progress function: %s\n",
|
||||
psm_error_get_string(err));
|
||||
orte_show_help("help-mtl-psm.txt",
|
||||
"error polling network", true,
|
||||
psm_error_get_string(err));
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -81,7 +81,7 @@ extern int ompi_mtl_psm_cancel(struct mca_mtl_base_module_t* mtl,
|
||||
|
||||
extern int ompi_mtl_psm_finalize(struct mca_mtl_base_module_t* mtl);
|
||||
|
||||
int ompi_mtl_psm_module_init(void);
|
||||
int ompi_mtl_psm_module_init(int local_rank, int num_local_procs);
|
||||
|
||||
|
||||
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 QLogic Corporation. All rights reserved.
|
||||
* Copyright (c) 2006-2009 QLogic Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -19,9 +19,11 @@
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/event/event.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "ompi/proc/proc.h"
|
||||
|
||||
#include "mtl_psm.h"
|
||||
#include "mtl_psm_types.h"
|
||||
@ -65,46 +67,47 @@ static int
|
||||
ompi_mtl_psm_component_open(void)
|
||||
{
|
||||
|
||||
mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version,
|
||||
"connect_timeout",
|
||||
"PSM connection timeout value in seconds",
|
||||
false, false, 30, &ompi_mtl_psm.connect_timeout);
|
||||
mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version,
|
||||
"connect_timeout",
|
||||
"PSM connection timeout value in seconds",
|
||||
false, false, 30, &ompi_mtl_psm.connect_timeout);
|
||||
|
||||
mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version,
|
||||
"debug",
|
||||
"PSM debug level",
|
||||
false, false, 1,
|
||||
&ompi_mtl_psm.debug_level);
|
||||
mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version,
|
||||
"debug",
|
||||
"PSM debug level",
|
||||
false, false, 1,
|
||||
&ompi_mtl_psm.debug_level);
|
||||
|
||||
mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version,
|
||||
"ib_unit",
|
||||
"Truescale unit to use",
|
||||
false, false, -1,
|
||||
&ompi_mtl_psm.ib_unit);
|
||||
mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version,
|
||||
"ib_unit",
|
||||
"Truescale unit to use",
|
||||
false, false, -1,
|
||||
&ompi_mtl_psm.ib_unit);
|
||||
|
||||
mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version,
|
||||
"ib_port",
|
||||
"Truescale port on unit to use",
|
||||
false, false, 0,
|
||||
&ompi_mtl_psm.ib_port);
|
||||
mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version,
|
||||
"ib_port",
|
||||
"Truescale port on unit to use",
|
||||
false, false, 0,
|
||||
&ompi_mtl_psm.ib_port);
|
||||
|
||||
mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version,
|
||||
"ib_service_level",
|
||||
"Infiniband service level"
|
||||
"(0 <= SL <= 15)",
|
||||
false, false, 0, &ompi_mtl_psm.ib_service_level);
|
||||
mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version,
|
||||
"ib_service_level",
|
||||
"Infiniband service level"
|
||||
"(0 <= SL <= 15)",
|
||||
false, false, 0, &ompi_mtl_psm.ib_service_level);
|
||||
|
||||
ompi_mtl_psm.ib_pkey = 0x7fffUL;
|
||||
mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version,
|
||||
"ib_pkey",
|
||||
"Infiniband partition key",
|
||||
false, false, 0x7fffUL,
|
||||
&ompi_mtl_psm.ib_pkey);
|
||||
ompi_mtl_psm.ib_pkey = 0x7fffUL;
|
||||
mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version,
|
||||
"ib_pkey",
|
||||
"Infiniband partition key",
|
||||
false, false, 0x7fffUL,
|
||||
&ompi_mtl_psm.ib_pkey);
|
||||
|
||||
if (ompi_mtl_psm.ib_service_level < 0)
|
||||
ompi_mtl_psm.ib_service_level = 0;
|
||||
else if (ompi_mtl_psm.ib_service_level > 15)
|
||||
ompi_mtl_psm.ib_service_level = 15;
|
||||
if (ompi_mtl_psm.ib_service_level < 0) {
|
||||
ompi_mtl_psm.ib_service_level = 0;
|
||||
} else if (ompi_mtl_psm.ib_service_level > 15) {
|
||||
ompi_mtl_psm.ib_service_level = 15;
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
@ -123,9 +126,40 @@ ompi_mtl_psm_component_init(bool enable_progress_threads,
|
||||
bool enable_mpi_threads)
|
||||
{
|
||||
psm_error_t err;
|
||||
int rc;
|
||||
int verno_major = PSM_VERNO_MAJOR;
|
||||
int verno_minor = PSM_VERNO_MINOR;
|
||||
|
||||
ompi_proc_t *my_proc, **procs;
|
||||
size_t num_total_procs;
|
||||
int local_rank = -1, num_local_procs = 0, proc;
|
||||
|
||||
/* Compute the total number of processes on this host and our local rank
|
||||
* on that node. We need to provide PSM with these values so it can
|
||||
* allocate hardware contexts appropriately across processes.
|
||||
*/
|
||||
if ((rc = ompi_proc_refresh()) != OMPI_SUCCESS) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
my_proc = ompi_proc_local();
|
||||
if (NULL == (procs = ompi_proc_world(&num_total_procs))) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for (proc = 0; proc < num_total_procs; proc++) {
|
||||
if (my_proc == procs[proc]) {
|
||||
local_rank = num_local_procs++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
|
||||
num_local_procs++;
|
||||
}
|
||||
}
|
||||
|
||||
assert(local_rank >= 0 && num_local_procs > 0);
|
||||
free(procs);
|
||||
|
||||
err = psm_error_register_handler(NULL /* no ep */,
|
||||
PSM_ERRHANDLER_NOP);
|
||||
if (err) {
|
||||
@ -141,34 +175,37 @@ ompi_mtl_psm_component_init(bool enable_progress_threads,
|
||||
sizeof(unsigned));
|
||||
if (err) {
|
||||
/* Non fatal error. Can continue */
|
||||
opal_output(0, "Unable to set infinipath debug level (error %s)\n",
|
||||
psm_error_get_string(err));
|
||||
orte_show_help("help-mtl-psm.txt",
|
||||
"psm init", false,
|
||||
psm_error_get_string(err));
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Only allow for shm and ipath devices in 2.0 and earlier releases
|
||||
* (unless the user overrides the setting).
|
||||
*/
|
||||
setenv("PSM_DEVICES", "shm,ipath", 0);
|
||||
|
||||
|
||||
if (PSM_VERNO >= 0x0104) {
|
||||
setenv("PSM_DEVICES", "self,shm,ipath", 0);
|
||||
}
|
||||
else {
|
||||
setenv("PSM_DEVICES", "shm,ipath", 0);
|
||||
}
|
||||
|
||||
err = psm_init(&verno_major, &verno_minor);
|
||||
if (err) {
|
||||
opal_output(0, "Error in psm_init (error %s)\n",
|
||||
psm_error_get_string(err));
|
||||
return NULL;
|
||||
orte_show_help("help-mtl-psm.txt",
|
||||
"psm init", true,
|
||||
psm_error_get_string(err));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Enable 'self' device only in a post-2.0 release(s)
|
||||
*/
|
||||
if (PSM_VERNO >= 0x0104)
|
||||
setenv("PSM_DEVICES", "self,shm,ipath", 0);
|
||||
|
||||
ompi_mtl_psm_module_init();
|
||||
|
||||
/* Complete PSM initialization */
|
||||
ompi_mtl_psm_module_init(local_rank, num_local_procs);
|
||||
|
||||
ompi_mtl_psm.super.mtl_request_size =
|
||||
sizeof(mca_mtl_psm_request_t) -
|
||||
sizeof(struct mca_mtl_request_t);
|
||||
sizeof(mca_mtl_psm_request_t) -
|
||||
sizeof(struct mca_mtl_request_t);
|
||||
|
||||
return &ompi_mtl_psm.super;
|
||||
}
|
||||
|
@ -68,8 +68,11 @@ ompi_mtl_psm_irecv(struct mca_mtl_base_module_t* mtl,
|
||||
&mtl_psm_request->psm_request);
|
||||
|
||||
if (err) {
|
||||
opal_output(0, "Error in psm_mq_irecv (error %s)\n", psm_error_get_string(err));
|
||||
return OMPI_ERROR;
|
||||
orte_show_help("help-mtl-psm.txt",
|
||||
"error posting receive", true,
|
||||
psm_error_get_string(err),
|
||||
mtl_psm_request->buf, length);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
|
@ -70,8 +70,9 @@ ompi_mtl_psm_send(struct mca_mtl_base_module_t* mtl,
|
||||
mtl_psm_request.buf,
|
||||
length);
|
||||
|
||||
if (mtl_psm_request.free_after)
|
||||
if (mtl_psm_request.free_after) {
|
||||
free(mtl_psm_request.buf);
|
||||
}
|
||||
|
||||
return err == PSM_OK ? OMPI_SUCCESS : OMPI_ERROR;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user