Fix for a problem on SLURM we have been having since r12243, where mpirun would hang after the job had finished. It turns out that we were always reporting the daemons' names incorrectly, but we simply never noticed because we never used those names until r12243. This change makes the SLURM pls report the daemon names correctly.
This commit was SVN r12274. The following SVN revision numbers were found above: r12243 --> open-mpi/ompi@153e38ffc9
This commit is contained in:
parent 26781a567d
commit 93d61d01fb
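The hang itself was a bookkeeping error rather than anything SLURM-specific: the allocator hands the daemons a vpid range that starts at some nonzero value, but the reporting loop restarted its count at zero, so the names mpirun waited on never matched the names the daemons actually registered under. A minimal standalone sketch of that off-by-offset pattern and its fix (plain C; vpid_t, start_vpid, and num_daemons are illustrative stand-ins, not ORTE types):

#include <stdio.h>

typedef unsigned int vpid_t;   /* stand-in for orte_vpid_t */

int main(void)
{
    vpid_t start_vpid = 3;     /* pretend the allocator handed back vpids 3..5 */
    int num_daemons = 3;

    /* Buggy bookkeeping: restart numbering at 0, so the names we
       report (0..2) never match the names the daemons actually hold
       (3..5), and anything waiting on those names -- e.g. mpirun at
       shutdown -- waits forever. */
    for (vpid_t v = 0; v < (vpid_t) num_daemons; ++v) {
        printf("reported daemon name: vpid %u\n", v);
    }

    /* Fixed bookkeeping: remember the allocator's starting vpid and
       number the daemons from there, as the hunks below do with
       start_vpid. */
    for (vpid_t v = start_vpid; v < start_vpid + (vpid_t) num_daemons; ++v) {
        printf("actual daemon name:   vpid %u\n", v);
    }
    return 0;
}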
@@ -107,6 +107,11 @@ static int pls_slurm_open(void)
     mca_base_param_reg_int(comp, "debug", "Enable debugging of slurm pls",
                            false, false, 0,
                            &mca_pls_slurm_component.debug);
+    if (mca_pls_slurm_component.debug == 0) {
+        mca_base_param_reg_int_name("orte", "debug",
+                                    "Whether or not to enable debugging output for all ORTE components (0 or 1)",
+                                    false, false, false, &mca_pls_slurm_component.debug);
+    }
 
     mca_base_param_reg_int(comp, "priority", "Default selection priority",
                            false, false, 75,
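The hunk above also layers the debug knobs: the component-specific parameter is registered first, and only if it is left at its default of 0 does the component inherit the framework-wide orte debug setting. A minimal sketch of the same fall-back-to-global pattern, using environment variables instead of the real MCA parameter machinery (the variable names here are illustrative, not actual MCA names):

#include <stdio.h>
#include <stdlib.h>

/* Read an integer setting, defaulting to 0 when unset. */
static int read_int_setting(const char *name)
{
    const char *val = getenv(name);
    return (NULL != val) ? atoi(val) : 0;
}

int main(void)
{
    /* Component-level knob first... */
    int debug = read_int_setting("PLS_SLURM_DEBUG");

    /* ...and only if it is still at its default, inherit the
       framework-wide knob, mirroring the new block in the hunk above. */
    if (0 == debug) {
        debug = read_int_setting("ORTE_DEBUG");
    }

    printf("debug = %d\n", debug);
    return 0;
}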
@@ -113,6 +113,7 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
     opal_list_item_t *item;
     size_t num_nodes;
     orte_vpid_t vpid;
+    orte_vpid_t start_vpid;
     char *jobid_string;
     char *uri, *param;
     char **argv;
@@ -159,6 +160,7 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
     if (ORTE_SUCCESS != rc) {
         goto cleanup;
     }
+    start_vpid = vpid;
 
     /* setup the orted triggers for passing their launch info */
     if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(jobid, num_nodes, NULL, NULL))) {
@@ -338,7 +340,7 @@ static int pls_slurm_launch_job(orte_jobid_t jobid)
     }
 
     /* setup the daemon info for each node */
-    vpid = 0;
+    vpid = start_vpid;
     for (item = opal_list_get_first(&map->nodes);
          item != opal_list_get_end(&map->nodes);
          item = opal_list_get_next(item)) {
@@ -559,6 +561,7 @@ static int pls_slurm_start_proc(int argc, char **argv, char **env,
 
     /* When not in debug mode, tie stdout/stderr to dev null so we
        don't see messages from orted */
+    /* XXX: this prevents --debug-daemons from working */
     if (!mca_pls_slurm_component.debug) {
         fd = open("/dev/null", O_CREAT|O_WRONLY|O_TRUNC, 0666);
         if (fd >= 0) {
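The last hunk only adds a comment, but that comment flags a real trade-off: pointing the orteds' stdout/stderr at /dev/null keeps launch output quiet, yet it also swallows exactly the output --debug-daemons is meant to show. A minimal sketch of the underlying redirection idiom (open plus dup2; the dup2 calls are assumed from the common idiom, since this hunk shows only the open):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int debug = 0;   /* stand-in for mca_pls_slurm_component.debug */

    if (!debug) {
        int fd = open("/dev/null", O_CREAT | O_WRONLY | O_TRUNC, 0666);
        if (fd >= 0) {
            /* Point both stdout and stderr at /dev/null; from here on,
               nothing this process prints reaches the user. */
            dup2(fd, STDOUT_FILENO);
            dup2(fd, STDERR_FILENO);
            if (fd > STDERR_FILENO) {
                close(fd);
            }
        }
    }

    printf("this line disappears when debug == 0\n");
    fprintf(stderr, "and so does this one\n");
    return 0;
}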