1
1

When directly launched by a resource manager (RM), flag that a process is operating without daemons - i.e., standalone. Provide an error string for the new socket_not_available error. Use errmgr.abort to exit when we cannot get a socket, and ensure that the slurmd module returns the proper exit status for slurm 2.0.

This commit was SVN r21868.
Этот коммит содержится в:
Ralph Castain 2009-08-22 02:58:20 +00:00
родитель 7370235c3e
Коммит 509cc0553c
7 изменённых файлов: 39 добавлений и 16 удалений

Просмотреть файл

@ -59,6 +59,7 @@
static int rte_init(void); static int rte_init(void);
static int rte_finalize(void); static int rte_finalize(void);
static void rte_abort(int error_code, bool report) __opal_attribute_noreturn__;
static uint8_t proc_get_locality(orte_process_name_t *proc); static uint8_t proc_get_locality(orte_process_name_t *proc);
static orte_vpid_t proc_get_daemon(orte_process_name_t *proc); static orte_vpid_t proc_get_daemon(orte_process_name_t *proc);
static char* proc_get_hostname(orte_process_name_t *proc); static char* proc_get_hostname(orte_process_name_t *proc);
@ -70,7 +71,7 @@ static int update_nidmap(opal_byte_object_t *bo);
orte_ess_base_module_t orte_ess_slurmd_module = { orte_ess_base_module_t orte_ess_slurmd_module = {
rte_init, rte_init,
rte_finalize, rte_finalize,
orte_ess_base_app_abort, rte_abort,
proc_get_locality, proc_get_locality,
proc_get_daemon, proc_get_daemon,
proc_get_hostname, proc_get_hostname,
@ -124,6 +125,9 @@ static int rte_init(void)
* provided that info in our environment, so get it from there * provided that info in our environment, so get it from there
*/ */
/* declare ourselves to be standalone - i.e., not launched by orted */
orte_standalone_operation = true;
/* get the slurm jobid - this will be our job family */ /* get the slurm jobid - this will be our job family */
envar = getenv("SLURM_JOBID"); envar = getenv("SLURM_JOBID");
/* don't need to check this for NULL - if it was, we would /* don't need to check this for NULL - if it was, we would
@ -353,11 +357,6 @@ static int rte_init(void)
return ORTE_SUCCESS; return ORTE_SUCCESS;
error: error:
if (ORTE_ERR_SOCKET_NOT_AVAILABLE == ret && slurm20) {
/* exit silently with a special error code for slurm 2.0 */
exit(108);
}
orte_show_help("help-orte-runtime.txt", orte_show_help("help-orte-runtime.txt",
"orte_init:startup:internal-failure", "orte_init:startup:internal-failure",
true, error, ORTE_ERROR_NAME(ret), ret); true, error, ORTE_ERROR_NAME(ret), ret);
@ -384,6 +383,16 @@ static int rte_finalize(void)
return ret; return ret;
} }
/*
 * Abort entry point for the slurmd ESS module.
 *
 * Delegates to orte_ess_base_app_abort(). The one special case: when the
 * failure is ORTE_ERR_SOCKET_NOT_AVAILABLE and the slurm20 flag is set,
 * the abort is issued with status 108 and reporting disabled, i.e. a
 * silent exit with the special error code used for slurm 2.0.
 *
 * error_code  status describing why the process is aborting
 * report      whether the base abort routine should report the abort
 */
static void rte_abort(int error_code, bool report)
{
    int status = error_code;
    bool announce = report;

    if (ORTE_ERR_SOCKET_NOT_AVAILABLE == error_code && slurm20) {
        /* slurm 2.0: swap in the special exit status and stay quiet */
        status = 108;
        announce = false;
    }

    orte_ess_base_app_abort(status, announce);
}
static uint8_t proc_get_locality(orte_process_name_t *proc) static uint8_t proc_get_locality(orte_process_name_t *proc)
{ {
orte_nid_t *nid; orte_nid_t *nid;

Просмотреть файл

@ -546,7 +546,6 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t
struct sockaddr_storage inaddr; struct sockaddr_storage inaddr;
opal_socklen_t addrlen; opal_socklen_t addrlen;
char **ports=NULL; char **ports=NULL;
char *ctmp;
/* create a listen socket for incoming connections */ /* create a listen socket for incoming connections */
*target_sd = socket(af_family, SOCK_STREAM, 0); *target_sd = socket(af_family, SOCK_STREAM, 0);
@ -786,17 +785,15 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t
} }
/* if we reach this point, then no socket could be found in the specified /* cleanup */
* range that was available to us, so report the error
*/
ctmp = opal_argv_join(ports, ',');
opal_output(0, "%s oob:tcp:bind() failed - no port available in specified list:\n\t%s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ctmp);
free(ctmp);
/* cleanup and return the error */
CLOSE_THE_SOCKET(*target_sd); CLOSE_THE_SOCKET(*target_sd);
opal_argv_free(ports); opal_argv_free(ports);
if (orte_standalone_operation) {
/* if we are running as a standalone app - i.e., one
* not launched by orteds - then abort
*/
orte_errmgr.abort(ORTE_ERR_SOCKET_NOT_AVAILABLE, NULL);
}
return ORTE_ERR_SOCKET_NOT_AVAILABLE; return ORTE_ERR_SOCKET_NOT_AVAILABLE;

Просмотреть файл

@ -60,6 +60,7 @@ bool orted_spin_flag = false;
/* ORTE OOB port flags */ /* ORTE OOB port flags */
bool orte_static_ports = false; bool orte_static_ports = false;
char *orte_oob_static_ports = NULL; char *orte_oob_static_ports = NULL;
bool orte_standalone_operation = false;
bool orte_keep_fqdn_hostnames = false; bool orte_keep_fqdn_hostnames = false;
bool orte_show_resolved_nodenames; bool orte_show_resolved_nodenames;

Просмотреть файл

@ -491,6 +491,7 @@ ORTE_DECLSPEC extern bool orted_spin_flag;
/* ORTE OOB port flags */ /* ORTE OOB port flags */
ORTE_DECLSPEC extern bool orte_static_ports; ORTE_DECLSPEC extern bool orte_static_ports;
ORTE_DECLSPEC extern char *orte_oob_static_ports; ORTE_DECLSPEC extern char *orte_oob_static_ports;
ORTE_DECLSPEC extern bool orte_standalone_operation;
ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames; ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames;
ORTE_DECLSPEC extern bool orte_show_resolved_nodenames; ORTE_DECLSPEC extern bool orte_show_resolved_nodenames;

Просмотреть файл

@ -252,6 +252,14 @@ while attempting to start process rank %lu.
Error name: %s Error name: %s
Node: %s Node: %s
when attempting to start process rank %lu.
#
[orterun:proc-socket-not-avail]
%s was unable to start the specified application as it encountered an error:
Error name: %s
Node: %s
when attempting to start process rank %lu. when attempting to start process rank %lu.
# #
[orterun:proc-failed-to-start-no-status] [orterun:proc-failed-to-start-no-status]

Просмотреть файл

@ -1004,6 +1004,10 @@ static void dump_aborted_procs(void)
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true, orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
orterun_basename, ORTE_ERROR_NAME(proc->exit_code), proc->node->name, orterun_basename, ORTE_ERROR_NAME(proc->exit_code), proc->node->name,
(unsigned long)proc->name.vpid); (unsigned long)proc->name.vpid);
} else if (ORTE_ERR_SOCKET_NOT_AVAILABLE == proc->exit_code) {
orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true,
orterun_basename, ORTE_ERROR_NAME(proc->exit_code), proc->node->name,
(unsigned long)proc->name.vpid);
} else { } else {
orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true, orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
orterun_basename, proc->node->name); orterun_basename, proc->node->name);

Просмотреть файл

@ -118,6 +118,9 @@ orte_err2str(int errnum)
case ORTE_ERR_SYS_LIMITS_SOCKETS: case ORTE_ERR_SYS_LIMITS_SOCKETS:
retval = "The system limit on number of network connections a process can open was reached"; retval = "The system limit on number of network connections a process can open was reached";
break; break;
case ORTE_ERR_SOCKET_NOT_AVAILABLE:
retval = "Unable to open a TCP socket for out-of-band communications";
break;
default: default:
retval = NULL; retval = NULL;
} }