When a process is launched directly by a resource manager (RM), flag that it is operating without daemons - i.e., standalone. Provide an error string for the new socket_not_available error (ORTE_ERR_SOCKET_NOT_AVAILABLE). Use errmgr.abort to exit when we cannot get a socket, and ensure that the slurmd module returns the proper exit status for SLURM 2.0.
This commit was SVN r21868.
Этот коммит содержится в:
родитель
7370235c3e
Коммит
509cc0553c
@ -59,6 +59,7 @@
|
||||
|
||||
static int rte_init(void);
|
||||
static int rte_finalize(void);
|
||||
static void rte_abort(int error_code, bool report) __opal_attribute_noreturn__;
|
||||
static uint8_t proc_get_locality(orte_process_name_t *proc);
|
||||
static orte_vpid_t proc_get_daemon(orte_process_name_t *proc);
|
||||
static char* proc_get_hostname(orte_process_name_t *proc);
|
||||
@ -70,7 +71,7 @@ static int update_nidmap(opal_byte_object_t *bo);
|
||||
orte_ess_base_module_t orte_ess_slurmd_module = {
|
||||
rte_init,
|
||||
rte_finalize,
|
||||
orte_ess_base_app_abort,
|
||||
rte_abort,
|
||||
proc_get_locality,
|
||||
proc_get_daemon,
|
||||
proc_get_hostname,
|
||||
@ -124,6 +125,9 @@ static int rte_init(void)
|
||||
* provided that info in our environment, so get it from there
|
||||
*/
|
||||
|
||||
/* declare ourselves to be standalone - i.e., not launched by orted */
|
||||
orte_standalone_operation = true;
|
||||
|
||||
/* get the slurm jobid - this will be our job family */
|
||||
envar = getenv("SLURM_JOBID");
|
||||
/* don't need to check this for NULL - if it was, we would
|
||||
@ -353,11 +357,6 @@ static int rte_init(void)
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
error:
|
||||
if (ORTE_ERR_SOCKET_NOT_AVAILABLE == ret && slurm20) {
|
||||
/* exit silently with a special error code for slurm 2.0 */
|
||||
exit(108);
|
||||
}
|
||||
|
||||
orte_show_help("help-orte-runtime.txt",
|
||||
"orte_init:startup:internal-failure",
|
||||
true, error, ORTE_ERROR_NAME(ret), ret);
|
||||
@ -384,6 +383,16 @@ static int rte_finalize(void)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Abort entry point for the slurmd ESS module.
 *
 * Wraps orte_ess_base_app_abort() and remaps exactly one case: when
 * error_code is ORTE_ERR_SOCKET_NOT_AVAILABLE and the slurm20 flag is set,
 * the base abort is invoked with status 108 and reporting disabled. Every
 * other combination is forwarded to the base abort unchanged.
 *
 * error_code - status the caller wants to abort with
 * report     - forwarded as-is to orte_ess_base_app_abort() on the
 *              pass-through path; ignored (false is passed) on the
 *              slurm 2.0 path
 *
 * NOTE(review): slurm20 is defined outside this view - presumably set
 * during rte_init when running under slurm 2.0; confirm.
 */
static void rte_abort(int error_code, bool report)
|
||||
{
|
||||
if (ORTE_ERR_SOCKET_NOT_AVAILABLE == error_code && slurm20) {
|
||||
/* exit silently with a special error code for slurm 2.0: status is forced to 108 and report to false */
|
||||
orte_ess_base_app_abort(108, false);
|
||||
} else {
|
||||
/* any other error, or not slurm 2.0: hand the caller's arguments straight to the base abort */
|
||||
orte_ess_base_app_abort(error_code, report);
|
||||
}
|
||||
}
|
||||
|
||||
static uint8_t proc_get_locality(orte_process_name_t *proc)
|
||||
{
|
||||
orte_nid_t *nid;
|
||||
|
@ -546,7 +546,6 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t
|
||||
struct sockaddr_storage inaddr;
|
||||
opal_socklen_t addrlen;
|
||||
char **ports=NULL;
|
||||
char *ctmp;
|
||||
|
||||
/* create a listen socket for incoming connections */
|
||||
*target_sd = socket(af_family, SOCK_STREAM, 0);
|
||||
@ -786,17 +785,15 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t
|
||||
|
||||
}
|
||||
|
||||
/* if we reach this point, then no socket could be found in the specified
|
||||
* range that was available to us, so report the error
|
||||
*/
|
||||
ctmp = opal_argv_join(ports, ',');
|
||||
opal_output(0, "%s oob:tcp:bind() failed - no port available in specified list:\n\t%s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ctmp);
|
||||
free(ctmp);
|
||||
|
||||
/* cleanup and return the error */
|
||||
/* cleanup */
|
||||
CLOSE_THE_SOCKET(*target_sd);
|
||||
opal_argv_free(ports);
|
||||
if (orte_standalone_operation) {
|
||||
/* if we are running as a standalone app - i.e., one
|
||||
* not launched by orteds - then abort
|
||||
*/
|
||||
orte_errmgr.abort(ORTE_ERR_SOCKET_NOT_AVAILABLE, NULL);
|
||||
}
|
||||
return ORTE_ERR_SOCKET_NOT_AVAILABLE;
|
||||
|
||||
|
||||
|
@ -60,6 +60,7 @@ bool orted_spin_flag = false;
|
||||
/* ORTE OOB port flags */
|
||||
bool orte_static_ports = false;
|
||||
char *orte_oob_static_ports = NULL;
|
||||
bool orte_standalone_operation = false;
|
||||
|
||||
bool orte_keep_fqdn_hostnames = false;
|
||||
bool orte_show_resolved_nodenames;
|
||||
|
@ -491,6 +491,7 @@ ORTE_DECLSPEC extern bool orted_spin_flag;
|
||||
/* ORTE OOB port flags */
|
||||
ORTE_DECLSPEC extern bool orte_static_ports;
|
||||
ORTE_DECLSPEC extern char *orte_oob_static_ports;
|
||||
ORTE_DECLSPEC extern bool orte_standalone_operation;
|
||||
|
||||
ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames;
|
||||
ORTE_DECLSPEC extern bool orte_show_resolved_nodenames;
|
||||
|
@ -252,6 +252,14 @@ while attempting to start process rank %lu.
|
||||
Error name: %s
|
||||
Node: %s
|
||||
|
||||
when attempting to start process rank %lu.
|
||||
#
|
||||
[orterun:proc-socket-not-avail]
|
||||
%s was unable to start the specified application as it encountered an error:
|
||||
|
||||
Error name: %s
|
||||
Node: %s
|
||||
|
||||
when attempting to start process rank %lu.
|
||||
#
|
||||
[orterun:proc-failed-to-start-no-status]
|
||||
|
@ -1004,6 +1004,10 @@ static void dump_aborted_procs(void)
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
|
||||
orterun_basename, ORTE_ERROR_NAME(proc->exit_code), proc->node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
} else if (ORTE_ERR_SOCKET_NOT_AVAILABLE == proc->exit_code) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true,
|
||||
orterun_basename, ORTE_ERROR_NAME(proc->exit_code), proc->node->name,
|
||||
(unsigned long)proc->name.vpid);
|
||||
} else {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
|
||||
orterun_basename, proc->node->name);
|
||||
|
@ -118,6 +118,9 @@ orte_err2str(int errnum)
|
||||
case ORTE_ERR_SYS_LIMITS_SOCKETS:
|
||||
retval = "The system limit on number of network connections a process can open was reached";
|
||||
break;
|
||||
case ORTE_ERR_SOCKET_NOT_AVAILABLE:
|
||||
retval = "Unable to open a TCP socket for out-of-band communications";
|
||||
break;
|
||||
default:
|
||||
retval = NULL;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user