Bring in changes to support Cray's Compute Node Linux (CNL) and
Application Level Placement Scheduler (ALPS). This commit was tested on two Cray machines at ORNL: Jaguar (Catamount) and Rizzo (CNL Test cage). Both machines performed as they should across the commit. It is likely that more changes will follow as the work and environment stabilize. Most of the infrastructure works the same for Catamount and CNL except for a few bits. Below are the highlights: Default IFACE Change: On Catamount we can use PTL_IFACE_DEFAULT, but the CNL system we have access to will fail on this interface, so it should be set to: IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK,PTL_IFACE_SS). So if we detect that we are running with YOD then we use the former interface, and if we detect that we are running with ALPS then we use the latter. We will want to pursue a more elegant solution if this interface continues to change across machines. PtlGetId and cnos_register_ptlid: The header suggests that these should never be called when launching with YOD. But in the ALPS environment cnos_barrier() will hang forever if these functions are not called after PtlNIInit(). Since these functions only need to be called once, and the orte rmgr/cnos component is loaded before the ompi common/portals component, we just call these functions once in the rmgr/cnos component. cnos_barrier_init(): This is a no-op for YOD, but critical for ALPS. So be sure to call it before calling the first barrier in the rmgr/cnos component. cnos_barrier vs cnos_pm_barrier: It is suggested that cnos_pm_barrier only be used during finalization, as it will indicate to the launcher (yod or aprun) that the app is about to complete. It was suggested that we use the regular cnos_barrier() instead. I want to look into this a bit more to make sure there are no adverse side effects. A note has been placed in the code to indicate this reasoning. This commit was SVN r15756.
Этот коммит содержится в:
родитель
6248b2bb51
Коммит
755658694e
@ -61,20 +61,43 @@ ompi_common_portals_initialize(void)
|
||||
int
|
||||
ompi_common_portals_ni_initialize(ptl_handle_ni_t *ni_handle)
|
||||
{
|
||||
ptl_interface_t ni_iface = PTL_IFACE_DEFAULT;
|
||||
int max_interfaces;
|
||||
int launcher;
|
||||
int ret;
|
||||
|
||||
launcher = cnos_launcher();
|
||||
|
||||
/*
|
||||
* If we use the YOD launcher we can use the default interface
|
||||
* otherwise we need to use the SeaStar Bridged interface (for CNL/APRUN)
|
||||
*/
|
||||
if( launcher != CNOS_LAUNCHER_YOD ) {
|
||||
ni_iface = IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK,PTL_IFACE_SS);
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize Portals interface
|
||||
*/
|
||||
ret = PtlInit(&max_interfaces);
|
||||
if (PTL_OK != ret) {
|
||||
opal_output(0, "%5d: PtlInit failed, returning %d\n",
|
||||
cnos_get_rank(), ret);
|
||||
return OMPI_ERR_NOT_AVAILABLE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize a network device
|
||||
*/
|
||||
ret = PtlNIInit(PTL_IFACE_DEFAULT, /* interface to initialize */
|
||||
ret = PtlNIInit(ni_iface, /* interface to initialize */
|
||||
PTL_PID_ANY, /* let library assign our pid */
|
||||
NULL, /* no desired limits */
|
||||
NULL, /* actual limits */
|
||||
ni_handle /* our interface handle */
|
||||
);
|
||||
if (PTL_OK != ret && PTL_IFACE_DUP != ret) {
|
||||
opal_output(0, "%5d: PtlNIInit failed, returning %d\n",
|
||||
cnos_get_rank(), ret);
|
||||
opal_output(0, "%5d: PtlNIInit failed, returning %d (%s : %d)\n",
|
||||
cnos_get_rank(), ret, __FILE__, __LINE__);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
|
@ -113,6 +113,18 @@ ompi_common_portals_initialize(void)
|
||||
information */
|
||||
int max_interfaces;
|
||||
unsigned int nptl_procs, rank;
|
||||
ptl_interface_t ni_iface = PTL_IFACE_DEFAULT;
|
||||
int launcher;
|
||||
|
||||
launcher = cnos_launcher();
|
||||
|
||||
/*
|
||||
* If we use the YOD launcher we can use the default interface
|
||||
* otherwise we need to use the SeaStar Bridged interface (for CNL/APRUN)
|
||||
*/
|
||||
if( launcher != CNOS_LAUNCHER_YOD ) {
|
||||
ni_iface = IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK,PTL_IFACE_SS);
|
||||
}
|
||||
|
||||
ret = PtlInit(&max_interfaces);
|
||||
if (PTL_OK != ret) {
|
||||
@ -126,15 +138,15 @@ ompi_common_portals_initialize(void)
|
||||
PtlSetRank(PTL_INVALID_HANDLE, -1, -1);
|
||||
|
||||
/* Initialize a network device */
|
||||
ret = PtlNIInit(PTL_IFACE_DEFAULT, /* interface to initialize */
|
||||
ret = PtlNIInit(ni_iface, /* interface to initialize */
|
||||
PTL_PID_ANY, /* let library assign our pid */
|
||||
NULL, /* no desired limits */
|
||||
NULL, /* no need to have limits around */
|
||||
&active_ni_h /* our interface handle */
|
||||
);
|
||||
if (PTL_OK != ret) {
|
||||
opal_output(0, "%5d: PtlNIInit failed, returning %d\n",
|
||||
getpid(), ret);
|
||||
opal_output(0, "%5d: PtlNIInit failed, returning %d (%s : %d)\n",
|
||||
getpid(), ret, __FILE__, __LINE__);
|
||||
return OMPI_ERR_FATAL;
|
||||
}
|
||||
|
||||
@ -185,6 +197,18 @@ ompi_common_portals_ni_initialize(ptl_handle_ni_t *ni_handle)
|
||||
char *tmp;
|
||||
ompi_proc_t* proc_self = ompi_proc_local();
|
||||
int max_interfaces;
|
||||
ptl_interface_t ni_iface = PTL_IFACE_DEFAULT;
|
||||
int launcher;
|
||||
|
||||
launcher = cnos_launcher();
|
||||
|
||||
/*
|
||||
* If we use the YOD launcher we can use the default interface
|
||||
* otherwise we need to use the SeaStar Bridged interface (for CNL/APRUN)
|
||||
*/
|
||||
if( launcher != CNOS_LAUNCHER_YOD ) {
|
||||
ni_iface = IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK,PTL_IFACE_SS);
|
||||
}
|
||||
|
||||
/* get our world */
|
||||
procs = ompi_proc_world(&nprocs);
|
||||
@ -256,15 +280,15 @@ ompi_common_portals_ni_initialize(ptl_handle_ni_t *ni_handle)
|
||||
PtlSetRank(PTL_INVALID_HANDLE, -1, -1);
|
||||
|
||||
/* Initialize a network device */
|
||||
ret = PtlNIInit(PTL_IFACE_DEFAULT, /* interface to initialize */
|
||||
ret = PtlNIInit(ni_iface, /* interface to initialize */
|
||||
PTL_PID_ANY, /* let library assign our pid */
|
||||
NULL, /* no desired limits */
|
||||
NULL, /* no need to have limits around */
|
||||
&active_ni_h /* our interface handle */
|
||||
);
|
||||
if (PTL_OK != ret) {
|
||||
opal_output(0, "%5d: PtlNIInit failed, returning %d\n",
|
||||
getpid(), ret);
|
||||
opal_output(0, "%5d: PtlNIInit failed, returning %d (%s : %d)\n",
|
||||
getpid(), ret, __FILE__, __LINE__);
|
||||
return OMPI_ERR_FATAL;
|
||||
}
|
||||
|
||||
|
@ -18,6 +18,7 @@
|
||||
|
||||
#ifdef HAVE_CNOS_PM_BARRIER
|
||||
#include <catamount/cnos_mpi_os.h>
|
||||
#include <portals/portals3.h>
|
||||
#endif
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
@ -79,14 +80,88 @@ static int orte_rmgr_cnos_open(void)
|
||||
|
||||
static orte_rmgr_base_module_t *orte_rmgr_cnos_init(int* priority)
|
||||
{
|
||||
ptl_interface_t ni_iface = PTL_IFACE_DEFAULT;
|
||||
ptl_handle_ni_t ni_handle;
|
||||
int ret, max_interfaces;
|
||||
ptl_process_id_t ptl_process_id;
|
||||
int launcher;
|
||||
|
||||
/* set a priority higher than the proxy component */
|
||||
*priority = 10;
|
||||
|
||||
launcher = cnos_launcher();
|
||||
|
||||
/*
|
||||
* If we use the YOD launcher we can use the default interface
|
||||
* otherwise we need to use the SeaStar Bridged interface (for CNL/APRUN)
|
||||
*/
|
||||
if( launcher != CNOS_LAUNCHER_YOD ) {
|
||||
ni_iface = IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK,PTL_IFACE_SS);
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize Portals interface
|
||||
*/
|
||||
ret = PtlInit(&max_interfaces);
|
||||
if (PTL_OK != ret) {
|
||||
opal_output(0, "%5d: PtlInit failed, returning %d\n",
|
||||
cnos_get_rank(), ret);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize a network device
|
||||
*/
|
||||
ret = PtlNIInit(ni_iface, /* interface to initialize */
|
||||
PTL_PID_ANY, /* let library assign our pid */
|
||||
NULL, /* no desired limits */
|
||||
NULL, /* actual limits */
|
||||
&ni_handle /* our interface handle */
|
||||
);
|
||||
if (PTL_OK != ret && PTL_IFACE_DUP != ret) {
|
||||
opal_output(0, "%5d: PtlNIInit failed, returning %d [%s : %d]\n",
|
||||
cnos_get_rank(), ret, __FILE__, __LINE__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the Barrier
|
||||
* Note: No return value, assume success
|
||||
*/
|
||||
cnos_barrier_init(ni_handle);
|
||||
|
||||
/*
|
||||
* Register the ptl_process_id if *not* using yod.
|
||||
* If you do *not* do this before calling the barrier it will hang forever.
|
||||
*/
|
||||
if( launcher != CNOS_LAUNCHER_YOD ) {
|
||||
ret = PtlGetId(ni_handle, &ptl_process_id);
|
||||
if( PTL_OK != ret ) {
|
||||
opal_output(0, "%5d: PtlGetId failed, returning %d\n",
|
||||
cnos_get_rank(), ret);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ret = cnos_register_ptlid(ptl_process_id);
|
||||
if( PTL_OK != ret ) {
|
||||
opal_output(0, "%5d: cnos_register_ptlid failed, returning %d\n",
|
||||
cnos_get_rank(), ret);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef HAVE_CNOS_PM_BARRIER
|
||||
/* register with the process manager so that everyone aborts if
|
||||
any one process aborts. This is a bit slower than it needs to
|
||||
be, but useful. */
|
||||
cnos_pm_barrier(0);
|
||||
/*
|
||||
* Do not use cnos_pm_barrier() as that serves as a indicator to
|
||||
* the launcher that the job is exiting. Instead always use the
|
||||
* normal cnos_barrier().
|
||||
* JJH Double check:
|
||||
* register with the process manager so that everyone aborts if
|
||||
* any one process aborts. This is a bit slower than it needs to
|
||||
* be, but useful.
|
||||
* Replaced: cnos_pm_barrier(0);
|
||||
*/
|
||||
cnos_barrier();
|
||||
#endif
|
||||
|
||||
return &orte_rmgr_cnos_module;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user