1
1

When a process is launched directly by a resource manager (RM), flag that it is operating without daemons, i.e., standalone. Provide an error string for the new ORTE_ERR_SOCKET_NOT_AVAILABLE error. Use errmgr.abort to exit when we cannot get a socket, and ensure that the slurmd module returns the proper exit status for SLURM 2.0.

This commit was SVN r21868.
Этот коммит содержится в:
Ralph Castain 2009-08-22 02:58:20 +00:00
родитель 7370235c3e
Коммит 509cc0553c
7 изменённых файлов: 39 добавлений и 16 удалений

Просмотреть файл

@ -59,6 +59,7 @@
static int rte_init(void);
static int rte_finalize(void);
static void rte_abort(int error_code, bool report) __opal_attribute_noreturn__;
static uint8_t proc_get_locality(orte_process_name_t *proc);
static orte_vpid_t proc_get_daemon(orte_process_name_t *proc);
static char* proc_get_hostname(orte_process_name_t *proc);
@ -70,7 +71,7 @@ static int update_nidmap(opal_byte_object_t *bo);
orte_ess_base_module_t orte_ess_slurmd_module = {
rte_init,
rte_finalize,
orte_ess_base_app_abort,
rte_abort,
proc_get_locality,
proc_get_daemon,
proc_get_hostname,
@ -124,6 +125,9 @@ static int rte_init(void)
* provided that info in our environment, so get it from there
*/
/* declare ourselves to be standalone - i.e., not launched by orted */
orte_standalone_operation = true;
/* get the slurm jobid - this will be our job family */
envar = getenv("SLURM_JOBID");
/* don't need to check this for NULL - if it was, we would
@ -353,11 +357,6 @@ static int rte_init(void)
return ORTE_SUCCESS;
error:
if (ORTE_ERR_SOCKET_NOT_AVAILABLE == ret && slurm20) {
/* exit silently with a special error code for slurm 2.0 */
exit(108);
}
orte_show_help("help-orte-runtime.txt",
"orte_init:startup:internal-failure",
true, error, ORTE_ERROR_NAME(ret), ret);
@ -384,6 +383,16 @@ static int rte_finalize(void)
return ret;
}
/*
 * Abort hook for the slurmd ESS module.
 *
 * Forwards to orte_ess_base_app_abort(). In the one special case where
 * the error is ORTE_ERR_SOCKET_NOT_AVAILABLE and we are running under
 * slurm 2.0, the status is replaced by 108 and reporting is turned off
 * so that the process exits silently with the code slurm 2.0 expects.
 *
 * error_code  status the caller wants to abort with
 * report      forwarded unchanged to the base abort routine, except in
 *             the slurm 2.0 special case where false is passed
 */
static void rte_abort(int error_code, bool report)
{
    int status = error_code;
    bool do_report = report;

    if (ORTE_ERR_SOCKET_NOT_AVAILABLE == error_code && slurm20) {
        /* exit silently with a special error code for slurm 2.0 */
        status = 108;
        do_report = false;
    }

    orte_ess_base_app_abort(status, do_report);
}
static uint8_t proc_get_locality(orte_process_name_t *proc)
{
orte_nid_t *nid;

Просмотреть файл

@ -546,7 +546,6 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t
struct sockaddr_storage inaddr;
opal_socklen_t addrlen;
char **ports=NULL;
char *ctmp;
/* create a listen socket for incoming connections */
*target_sd = socket(af_family, SOCK_STREAM, 0);
@ -786,17 +785,15 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t
}
/* if we reach this point, then no socket could be found in the specified
* range that was available to us, so report the error
*/
ctmp = opal_argv_join(ports, ',');
opal_output(0, "%s oob:tcp:bind() failed - no port available in specified list:\n\t%s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ctmp);
free(ctmp);
/* cleanup and return the error */
/* cleanup */
CLOSE_THE_SOCKET(*target_sd);
opal_argv_free(ports);
if (orte_standalone_operation) {
/* if we are running as a standalone app - i.e., one
* not launched by orteds - then abort
*/
orte_errmgr.abort(ORTE_ERR_SOCKET_NOT_AVAILABLE, NULL);
}
return ORTE_ERR_SOCKET_NOT_AVAILABLE;

Просмотреть файл

@ -60,6 +60,7 @@ bool orted_spin_flag = false;
/* ORTE OOB port flags */
bool orte_static_ports = false;
char *orte_oob_static_ports = NULL;
/* true when this process is operating standalone, i.e., it was not
 * launched by an orted; defaults to false
 */
bool orte_standalone_operation = false;
bool orte_keep_fqdn_hostnames = false;
bool orte_show_resolved_nodenames;

Просмотреть файл

@ -491,6 +491,7 @@ ORTE_DECLSPEC extern bool orted_spin_flag;
/* ORTE OOB port flags */
ORTE_DECLSPEC extern bool orte_static_ports;
ORTE_DECLSPEC extern char *orte_oob_static_ports;
/* true when this process is operating standalone, i.e., it was not
 * launched by an orted
 */
ORTE_DECLSPEC extern bool orte_standalone_operation;
ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames;
ORTE_DECLSPEC extern bool orte_show_resolved_nodenames;

Просмотреть файл

@ -252,6 +252,14 @@ while attempting to start process rank %lu.
Error name: %s
Node: %s
when attempting to start process rank %lu.
#
[orterun:proc-socket-not-avail]
%s was unable to start the specified application as it encountered an error:
Error name: %s
Node: %s
when attempting to start process rank %lu.
#
[orterun:proc-failed-to-start-no-status]

Просмотреть файл

@ -1004,6 +1004,10 @@ static void dump_aborted_procs(void)
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
orterun_basename, ORTE_ERROR_NAME(proc->exit_code), proc->node->name,
(unsigned long)proc->name.vpid);
} else if (ORTE_ERR_SOCKET_NOT_AVAILABLE == proc->exit_code) {
orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true,
orterun_basename, ORTE_ERROR_NAME(proc->exit_code), proc->node->name,
(unsigned long)proc->name.vpid);
} else {
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
orterun_basename, proc->node->name);

Просмотреть файл

@ -118,6 +118,9 @@ orte_err2str(int errnum)
case ORTE_ERR_SYS_LIMITS_SOCKETS:
retval = "The system limit on number of network connections a process can open was reached";
break;
case ORTE_ERR_SOCKET_NOT_AVAILABLE:
retval = "Unable to open a TCP socket for out-of-band communications";
break;
default:
retval = NULL;
}