Fixes to enable mpirun to work again on Cray
The ess pmi module was not handling aprun-launched daemons: all daemons thought they were vpid 1. Also, it turns out that on Cray systems using MOM nodes for launched jobs, just detecting whether or not a process is in a PAGG container is not sufficient. Crank up the priority of the alps PLM component in case configure detected the presence of both slurm and alps. Have the ESS pmi component open the pmix framework and select a pmix component. This commit was SVN r32773.
Этот коммит содержится в:
родитель
f2e586980b
Коммит
1508a01325
@ -30,6 +30,7 @@ const char *opal_pmix_cray_component_version_string =
|
||||
* Local function
|
||||
*/
|
||||
static int pmix_cray_component_query(mca_base_module_t **module, int *priority);
|
||||
static int pmix_cray_component_close(void);
|
||||
|
||||
|
||||
/*
|
||||
@ -58,7 +59,7 @@ const opal_pmix_base_component_t mca_pmix_cray_component = {
|
||||
/* Component open and close functions */
|
||||
|
||||
NULL,
|
||||
NULL,
|
||||
pmix_cray_component_close,
|
||||
pmix_cray_component_query,
|
||||
NULL
|
||||
},
|
||||
@ -85,7 +86,11 @@ static int pmix_cray_component_query(mca_base_module_t **module, int *priority)
|
||||
rc = OPAL_ERROR;
|
||||
} else {
|
||||
tmp = fgets(string, sizeof(string), fd);
|
||||
if (tmp) { /* okay we're in a PAGG container, got non-null output from job device */
|
||||
if (tmp && (getenv("ALPS_APP_DEPTH") != NULL)) { /* okay we're in a PAGG container,
|
||||
got non-null output from job device.
|
||||
The check for ALPS_APP_DEPTH is required
|
||||
since on systems using MOM nodes, the
|
||||
mpirun/orte is actually in a PAGG */
|
||||
*priority = 90;
|
||||
*module = (mca_base_module_t *)&opal_pmix_cray_module;
|
||||
rc = OPAL_SUCCESS;
|
||||
@ -96,4 +101,12 @@ static int pmix_cray_component_query(mca_base_module_t **module, int *priority)
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
 * Close hook for the Cray pmix component.
 *
 * Invokes the finalize entry point of opal_pmix_cray_module and hands
 * its return code back to the caller unchanged.
 */
static int pmix_cray_component_close(void)
|
||||
{
|
||||
int ret = OPAL_SUCCESS;
|
||||
|
||||
ret = opal_pmix_cray_module.finalize(); /* result is returned as-is below */
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -21,6 +21,7 @@
|
||||
|
||||
#include "opal/runtime/opal_params.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
#include "opal/mca/pmix/base/base.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
|
||||
@ -58,9 +59,17 @@ orte_ess_base_component_t mca_ess_pmi_component = {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/*
 * Open hook for the ess pmi component.
 *
 * Opens the opal pmix base framework and then runs pmix component
 * selection. Returns ORTE_ERROR if either step does not report
 * OPAL_SUCCESS, and ORTE_SUCCESS otherwise.
 */
static int pmi_component_open(void)
|
||||
{
|
||||
|
||||
/* step 1: open the pmix framework (flags argument is 0) */
if (OPAL_SUCCESS != mca_base_framework_open(&opal_pmix_base_framework, 0)) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* step 2: select a pmix component */
if (OPAL_SUCCESS != opal_pmix_base_select()) {
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -87,6 +96,6 @@ static int pmi_component_close(void)
|
||||
if (NULL != opal_pmix.finalize) {
|
||||
opal_pmix.finalize(); // balances query
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
return mca_base_framework_close(&opal_pmix_base_framework);
|
||||
}
|
||||
|
||||
|
@ -88,6 +88,7 @@ static int rte_init(void)
|
||||
char *rmluri;
|
||||
opal_value_t *kv, kvn;
|
||||
opal_list_t vals;
|
||||
orte_vpid_t starting_vpid = 1; // compensate for orterun
|
||||
|
||||
/* run the prolog */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
|
||||
@ -125,17 +126,24 @@ static int rte_init(void)
|
||||
}
|
||||
ORTE_PROC_MY_NAME->jobid = jobid;
|
||||
|
||||
/* if we weren't given it, get our global rank from PMI */
|
||||
if (NULL == orte_ess_base_vpid) {
|
||||
if (!opal_pmix.get_attr(PMIX_RANK, &kv)) {
|
||||
error = "getting rank";
|
||||
ret = ORTE_ERR_NOT_FOUND;
|
||||
goto error;
|
||||
if (NULL != orte_ess_base_vpid) {
|
||||
if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_vpid(&starting_vpid,
|
||||
orte_ess_base_vpid))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return(ret);
|
||||
}
|
||||
ORTE_PROC_MY_NAME->vpid = kv->data.uint32 + 1; // compensate for orterun
|
||||
OBJ_RELEASE(kv);
|
||||
}
|
||||
|
||||
if (!opal_pmix.get_attr(PMIX_RANK, &kv)) {
|
||||
error = "getting rank";
|
||||
ret = ORTE_ERR_NOT_FOUND;
|
||||
goto error;
|
||||
}
|
||||
|
||||
ORTE_PROC_MY_NAME->vpid = kv->data.uint32 + starting_vpid;
|
||||
fprintf(stderr,"Setting my vpid to %d\n",ORTE_PROC_MY_NAME->vpid);
|
||||
OBJ_RELEASE(kv);
|
||||
|
||||
/* if we weren't given it, get universe size */
|
||||
if (orte_ess_base_num_procs < 0) {
|
||||
if (!opal_pmix.get_attr(PMIX_UNIV_SIZE, &kv)) {
|
||||
|
@ -103,7 +103,7 @@ static int plm_alps_register(void)
|
||||
mca_plm_alps_component.debug = orte_debug_flag;
|
||||
}
|
||||
|
||||
mca_plm_alps_component.priority = 75;
|
||||
mca_plm_alps_component.priority = 100;
|
||||
(void) mca_base_component_var_register (comp, "priority", "Default selection priority",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
|
@ -165,7 +165,8 @@ ras_alps_register(void)
|
||||
NULL, 0, 0, OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY, &ras_alps_read_attempts);
|
||||
|
||||
ras_alps_apstat_cmd = "/usr/bin/apstat";
|
||||
ras_alps_apstat_cmd = "apstat"; /* by default apstat is in a user's path on a Cray XE/XC if
|
||||
alps is the site's job launcher */
|
||||
(void) mca_base_component_var_register (&mca_ras_alps_component.base_version,
|
||||
"apstat_cmd", "Location of the apstat command",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_6,
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user