1
1

Bring in changes to support Cray's Compute Node Linux (CNL) and

Application Level Placement Scheduler (ALPS).

This commit was tested under two Cray machines at ORNL: Jaguar (Catamount)
and Rizzo (CNL Test cage). Both machines performed as they should across
the commit.

It is likely that more changes will follow this as the work and environment
stabilize.

Most of the infrastructure works the same for Catamount and CNL
except for a few bits. Below are the highlights:

Default IFACE Change:
 On Catamount we can use PTL_IFACE_DEFAULT, but the CNL system we have access
 to will fail on this interface, so it should instead be set to:
    IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK,PTL_IFACE_SS).
 So if we detect that we are running with YOD then use the former interface
 and if we detect that we are running with ALPS then use the latter.
 We will want to pursue a more elegant solution if this interface continues to 
 change across machines.

PtlGetId and cnos_register_ptlid:
 The header suggests that these should never be called when launching with YOD.
 But in the ALPS environment the cnos_barrier() will hang forever if these 
 functions are not called after PtlNIInit(). Since these functions only need to
 be called once, and the orte rmgr/cnos component is loaded before the ompi
 common/portals component, we just call them once in the rmgr/cnos
 component.

cnos_barrier_init():
 This is a no-op for YOD, but critical for ALPS, so be sure to call it before
 calling the first barrier in the rmgr/cnos component.

cnos_barrier vs cnos_pm_barrier:
 It is suggested that cnos_pm_barrier() only be used during finalization,
 as it indicates to the launcher (yod or aprun) that the app is about
 to complete. It was suggested that we use the regular cnos_barrier() instead.
 I want to look into this a bit more to make sure there are not adverse
 side effects. A note has been placed in the code to indicate this reasoning.

This commit was SVN r15756.
Этот коммит содержится в:
Josh Hursey 2007-08-03 19:46:38 +00:00
родитель 6248b2bb51
Коммит 755658694e
3 изменённых файлов: 135 добавлений и 13 удалений

Просмотреть файл

@ -61,20 +61,43 @@ ompi_common_portals_initialize(void)
int
ompi_common_portals_ni_initialize(ptl_handle_ni_t *ni_handle)
{
ptl_interface_t ni_iface = PTL_IFACE_DEFAULT;
int max_interfaces;
int launcher;
int ret;
launcher = cnos_launcher();
/*
* If we use the YOD launcher we can use the default interface
* otherwise we need to use the SeaStar Bridged interface (for CNL/APRUN)
*/
if( launcher != CNOS_LAUNCHER_YOD ) {
ni_iface = IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK,PTL_IFACE_SS);
}
/*
* Initialize Portals interface
*/
ret = PtlInit(&max_interfaces);
if (PTL_OK != ret) {
opal_output(0, "%5d: PtlInit failed, returning %d\n",
cnos_get_rank(), ret);
return OMPI_ERR_NOT_AVAILABLE;
}
/*
* Initialize a network device
*/
ret = PtlNIInit(PTL_IFACE_DEFAULT, /* interface to initialize */
ret = PtlNIInit(ni_iface, /* interface to initialize */
PTL_PID_ANY, /* let library assign our pid */
NULL, /* no desired limits */
NULL, /* actual limits */
ni_handle /* our interface handle */
);
if (PTL_OK != ret && PTL_IFACE_DUP != ret) {
opal_output(0, "%5d: PtlNIInit failed, returning %d\n",
cnos_get_rank(), ret);
opal_output(0, "%5d: PtlNIInit failed, returning %d (%s : %d)\n",
cnos_get_rank(), ret, __FILE__, __LINE__);
return OMPI_ERROR;
}

Просмотреть файл

@ -113,6 +113,18 @@ ompi_common_portals_initialize(void)
information */
int max_interfaces;
unsigned int nptl_procs, rank;
ptl_interface_t ni_iface = PTL_IFACE_DEFAULT;
int launcher;
launcher = cnos_launcher();
/*
* If we use the YOD launcher we can use the default interface
* otherwise we need to use the SeaStar Bridged interface (for CNL/APRUN)
*/
if( launcher != CNOS_LAUNCHER_YOD ) {
ni_iface = IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK,PTL_IFACE_SS);
}
ret = PtlInit(&max_interfaces);
if (PTL_OK != ret) {
@ -126,15 +138,15 @@ ompi_common_portals_initialize(void)
PtlSetRank(PTL_INVALID_HANDLE, -1, -1);
/* Initialize a network device */
ret = PtlNIInit(PTL_IFACE_DEFAULT, /* interface to initialize */
ret = PtlNIInit(ni_iface, /* interface to initialize */
PTL_PID_ANY, /* let library assign our pid */
NULL, /* no desired limits */
NULL, /* no need to have limits around */
&active_ni_h /* our interface handle */
);
if (PTL_OK != ret) {
opal_output(0, "%5d: PtlNIInit failed, returning %d\n",
getpid(), ret);
opal_output(0, "%5d: PtlNIInit failed, returning %d (%s : %d)\n",
getpid(), ret, __FILE__, __LINE__);
return OMPI_ERR_FATAL;
}
@ -185,6 +197,18 @@ ompi_common_portals_ni_initialize(ptl_handle_ni_t *ni_handle)
char *tmp;
ompi_proc_t* proc_self = ompi_proc_local();
int max_interfaces;
ptl_interface_t ni_iface = PTL_IFACE_DEFAULT;
int launcher;
launcher = cnos_launcher();
/*
* If we use the YOD launcher we can use the default interface
* otherwise we need to use the SeaStar Bridged interface (for CNL/APRUN)
*/
if( launcher != CNOS_LAUNCHER_YOD ) {
ni_iface = IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK,PTL_IFACE_SS);
}
/* get our world */
procs = ompi_proc_world(&nprocs);
@ -256,15 +280,15 @@ ompi_common_portals_ni_initialize(ptl_handle_ni_t *ni_handle)
PtlSetRank(PTL_INVALID_HANDLE, -1, -1);
/* Initialize a network device */
ret = PtlNIInit(PTL_IFACE_DEFAULT, /* interface to initialize */
ret = PtlNIInit(ni_iface, /* interface to initialize */
PTL_PID_ANY, /* let library assign our pid */
NULL, /* no desired limits */
NULL, /* no need to have limits around */
&active_ni_h /* our interface handle */
);
if (PTL_OK != ret) {
opal_output(0, "%5d: PtlNIInit failed, returning %d\n",
getpid(), ret);
opal_output(0, "%5d: PtlNIInit failed, returning %d (%s : %d)\n",
getpid(), ret, __FILE__, __LINE__);
return OMPI_ERR_FATAL;
}

Просмотреть файл

@ -18,6 +18,7 @@
#ifdef HAVE_CNOS_PM_BARRIER
#include <catamount/cnos_mpi_os.h>
#include <portals/portals3.h>
#endif
#include "orte/orte_constants.h"
@ -79,14 +80,88 @@ static int orte_rmgr_cnos_open(void)
static orte_rmgr_base_module_t *orte_rmgr_cnos_init(int* priority)
{
ptl_interface_t ni_iface = PTL_IFACE_DEFAULT;
ptl_handle_ni_t ni_handle;
int ret, max_interfaces;
ptl_process_id_t ptl_process_id;
int launcher;
/* set a priority higher than the proxy component */
*priority = 10;
launcher = cnos_launcher();
/*
* If we use the YOD launcher we can use the default interface
* otherwise we need to use the SeaStar Bridged interface (for CNL/APRUN)
*/
if( launcher != CNOS_LAUNCHER_YOD ) {
ni_iface = IFACE_FROM_BRIDGE_AND_NALID(PTL_BRIDGE_UK,PTL_IFACE_SS);
}
/*
* Initialize Portals interface
*/
ret = PtlInit(&max_interfaces);
if (PTL_OK != ret) {
opal_output(0, "%5d: PtlInit failed, returning %d\n",
cnos_get_rank(), ret);
return NULL;
}
/*
* Initialize a network device
*/
ret = PtlNIInit(ni_iface, /* interface to initialize */
PTL_PID_ANY, /* let library assign our pid */
NULL, /* no desired limits */
NULL, /* actual limits */
&ni_handle /* our interface handle */
);
if (PTL_OK != ret && PTL_IFACE_DUP != ret) {
opal_output(0, "%5d: PtlNIInit failed, returning %d [%s : %d]\n",
cnos_get_rank(), ret, __FILE__, __LINE__);
return NULL;
}
/*
* Initialize the Barrier
* Note: No return value, assume success
*/
cnos_barrier_init(ni_handle);
/*
* Register the ptl_process_id if *not* using yod.
* If you do *not* do this before calling the barrier it will hang forever.
*/
if( launcher != CNOS_LAUNCHER_YOD ) {
ret = PtlGetId(ni_handle, &ptl_process_id);
if( PTL_OK != ret ) {
opal_output(0, "%5d: PtlGetId failed, returning %d\n",
cnos_get_rank(), ret);
return NULL;
}
ret = cnos_register_ptlid(ptl_process_id);
if( PTL_OK != ret ) {
opal_output(0, "%5d: cnos_register_ptlid failed, returning %d\n",
cnos_get_rank(), ret);
return NULL;
}
}
#ifdef HAVE_CNOS_PM_BARRIER
/* register with the process manager so that everyone aborts if
any one process aborts. This is a bit slower than it needs to
be, but useful. */
cnos_pm_barrier(0);
/*
* Do not use cnos_pm_barrier() as that serves as a indicator to
* the launcher that the job is exiting. Instead always use the
* normal cnos_barrier().
* JJH Double check:
* register with the process manager so that everyone aborts if
* any one process aborts. This is a bit slower than it needs to
* be, but useful.
* Replaced: cnos_pm_barrier(0);
*/
cnos_barrier();
#endif
return &orte_rmgr_cnos_module;