From d4f56cff61f849d9d66eaddc907950479cfcccdd Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 6 May 2010 20:57:17 +0000 Subject: [PATCH] More cleanup on paffinity....groan It is okay to not have a paffinity module IF you aren't using paffinity anyway. So don't error out of MPI_Init because a paffinity module wasn't selected. Cleanup error reporting in the odls default module to (once and for all!) eliminate messages originating in the fork'd process. Create some new error codes to allow us to pass enough info back to the parent process to provide useful error messages. This commit was SVN r23106. --- opal/include/opal/constants.h | 63 +++--- .../paffinity/base/paffinity_base_select.c | 17 +- .../paffinity/base/paffinity_base_service.c | 72 ++---- .../paffinity/base/paffinity_base_wrappers.c | 20 +- opal/runtime/opal_init.c | 32 ++- orte/include/orte/constants.h | 79 ++++--- orte/mca/odls/default/help-odls-default.txt | 34 ++- orte/mca/odls/default/odls_default_module.c | 206 ++++++++---------- orte/util/error_strings.c | 3 - 9 files changed, 256 insertions(+), 270 deletions(-) diff --git a/opal/include/opal/constants.h b/opal/include/opal/constants.h index 6d85734a34..e136d43691 100644 --- a/opal/include/opal/constants.h +++ b/opal/include/opal/constants.h @@ -24,32 +24,32 @@ #define OPAL_ERR_BASE 0 /* internal use only */ enum { - OPAL_SUCCESS = (OPAL_ERR_BASE), + OPAL_SUCCESS = (OPAL_ERR_BASE), - OPAL_ERROR = (OPAL_ERR_BASE - 1), - OPAL_ERR_OUT_OF_RESOURCE = (OPAL_ERR_BASE - 2), /* fatal error */ - OPAL_ERR_TEMP_OUT_OF_RESOURCE = (OPAL_ERR_BASE - 3), /* try again later */ - OPAL_ERR_RESOURCE_BUSY = (OPAL_ERR_BASE - 4), - OPAL_ERR_BAD_PARAM = (OPAL_ERR_BASE - 5), /* equivalent to MPI_ERR_ARG error code */ - OPAL_ERR_FATAL = (OPAL_ERR_BASE - 6), - OPAL_ERR_NOT_IMPLEMENTED = (OPAL_ERR_BASE - 7), - OPAL_ERR_NOT_SUPPORTED = (OPAL_ERR_BASE - 8), - OPAL_ERR_INTERUPTED = (OPAL_ERR_BASE - 9), - OPAL_ERR_WOULD_BLOCK = (OPAL_ERR_BASE - 10), - OPAL_ERR_IN_ERRNO = 
(OPAL_ERR_BASE - 11), - OPAL_ERR_UNREACH = (OPAL_ERR_BASE - 12), - OPAL_ERR_NOT_FOUND = (OPAL_ERR_BASE - 13), - OPAL_EXISTS = (OPAL_ERR_BASE - 14), /* indicates that the specified object already exists */ - OPAL_ERR_TIMEOUT = (OPAL_ERR_BASE - 15), - OPAL_ERR_NOT_AVAILABLE = (OPAL_ERR_BASE - 16), - OPAL_ERR_PERM = (OPAL_ERR_BASE - 17), /* no permission */ - OPAL_ERR_VALUE_OUT_OF_BOUNDS = (OPAL_ERR_BASE - 18), - OPAL_ERR_FILE_READ_FAILURE = (OPAL_ERR_BASE - 19), - OPAL_ERR_FILE_WRITE_FAILURE = (OPAL_ERR_BASE - 20), - OPAL_ERR_FILE_OPEN_FAILURE = (OPAL_ERR_BASE - 21), - OPAL_ERR_PACK_MISMATCH = (OPAL_ERR_BASE - 22), - OPAL_ERR_PACK_FAILURE = (OPAL_ERR_BASE - 23), - OPAL_ERR_UNPACK_FAILURE = (OPAL_ERR_BASE - 24), + OPAL_ERROR = (OPAL_ERR_BASE - 1), + OPAL_ERR_OUT_OF_RESOURCE = (OPAL_ERR_BASE - 2), /* fatal error */ + OPAL_ERR_TEMP_OUT_OF_RESOURCE = (OPAL_ERR_BASE - 3), /* try again later */ + OPAL_ERR_RESOURCE_BUSY = (OPAL_ERR_BASE - 4), + OPAL_ERR_BAD_PARAM = (OPAL_ERR_BASE - 5), /* equivalent to MPI_ERR_ARG error code */ + OPAL_ERR_FATAL = (OPAL_ERR_BASE - 6), + OPAL_ERR_NOT_IMPLEMENTED = (OPAL_ERR_BASE - 7), + OPAL_ERR_NOT_SUPPORTED = (OPAL_ERR_BASE - 8), + OPAL_ERR_INTERUPTED = (OPAL_ERR_BASE - 9), + OPAL_ERR_WOULD_BLOCK = (OPAL_ERR_BASE - 10), + OPAL_ERR_IN_ERRNO = (OPAL_ERR_BASE - 11), + OPAL_ERR_UNREACH = (OPAL_ERR_BASE - 12), + OPAL_ERR_NOT_FOUND = (OPAL_ERR_BASE - 13), + OPAL_EXISTS = (OPAL_ERR_BASE - 14), /* indicates that the specified object already exists */ + OPAL_ERR_TIMEOUT = (OPAL_ERR_BASE - 15), + OPAL_ERR_NOT_AVAILABLE = (OPAL_ERR_BASE - 16), + OPAL_ERR_PERM = (OPAL_ERR_BASE - 17), /* no permission */ + OPAL_ERR_VALUE_OUT_OF_BOUNDS = (OPAL_ERR_BASE - 18), + OPAL_ERR_FILE_READ_FAILURE = (OPAL_ERR_BASE - 19), + OPAL_ERR_FILE_WRITE_FAILURE = (OPAL_ERR_BASE - 20), + OPAL_ERR_FILE_OPEN_FAILURE = (OPAL_ERR_BASE - 21), + OPAL_ERR_PACK_MISMATCH = (OPAL_ERR_BASE - 22), + OPAL_ERR_PACK_FAILURE = (OPAL_ERR_BASE - 23), + OPAL_ERR_UNPACK_FAILURE = (OPAL_ERR_BASE 
- 24), OPAL_ERR_UNPACK_INADEQUATE_SPACE = (OPAL_ERR_BASE - 25), OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER = (OPAL_ERR_BASE - 26), OPAL_ERR_TYPE_MISMATCH = (OPAL_ERR_BASE - 27), @@ -57,7 +57,18 @@ enum { OPAL_ERR_UNKNOWN_DATA_TYPE = (OPAL_ERR_BASE - 29), OPAL_ERR_BUFFER = (OPAL_ERR_BASE - 30), OPAL_ERR_DATA_TYPE_REDEF = (OPAL_ERR_BASE - 31), - OPAL_ERR_DATA_OVERWRITE_ATTEMPT = (OPAL_ERR_BASE - 32) + OPAL_ERR_DATA_OVERWRITE_ATTEMPT = (OPAL_ERR_BASE - 32), + OPAL_ERR_MODULE_NOT_FOUND = (OPAL_ERR_BASE - 33), + OPAL_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED = (OPAL_ERR_BASE - 34), + OPAL_ERR_TOPO_SOCKET_NOT_SUPPORTED = (OPAL_ERR_BASE - 35), + OPAL_ERR_TOPO_CORE_NOT_SUPPORTED = (OPAL_ERR_BASE - 36), + OPAL_ERR_NOT_ENOUGH_SOCKETS = (OPAL_ERR_BASE - 37), + OPAL_ERR_NOT_ENOUGH_CORES = (OPAL_ERR_BASE - 38), + OPAL_ERR_INVALID_PHYS_CPU = (OPAL_ERR_BASE - 39), + OPAL_ERR_MULTIPLE_AFFINITIES = (OPAL_ERR_BASE - 40), + OPAL_ERR_SLOT_LIST_RANGE = (OPAL_ERR_BASE - 41) + + }; #define OPAL_ERR_MAX (OPAL_ERR_BASE - 100) diff --git a/opal/mca/paffinity/base/paffinity_base_select.c b/opal/mca/paffinity/base/paffinity_base_select.c index 74656d300e..66eeb159a6 100644 --- a/opal/mca/paffinity/base/paffinity_base_select.c +++ b/opal/mca/paffinity/base/paffinity_base_select.c @@ -37,7 +37,7 @@ const opal_paffinity_base_module_1_1_0_t *opal_paffinity_base_module = NULL; int opal_paffinity_base_select(void) { - int ret, exit_status = OPAL_SUCCESS; + int ret = OPAL_SUCCESS; opal_paffinity_base_component_2_0_0_t *best_component = NULL; opal_paffinity_base_module_1_1_0_t *best_module = NULL; @@ -48,9 +48,10 @@ int opal_paffinity_base_select(void) &opal_paffinity_base_components_opened, (mca_base_module_t **) &best_module, (mca_base_component_t **) &best_component) ) { - /* This will only happen if no component was selected */ - exit_status = OPAL_ERR_NOT_FOUND; - goto cleanup; + /* It is okay if we don't find a module - we will report an + * error if/when someone tries to actually use affinity + */ + 
return OPAL_SUCCESS; } /* Save the winner */ @@ -60,12 +61,8 @@ int opal_paffinity_base_select(void) /* Initialize the winner */ if (NULL != opal_paffinity_base_module) { - if (OPAL_SUCCESS != (ret = opal_paffinity_base_module->paff_module_init()) ) { - exit_status = ret; - goto cleanup; - } + ret = opal_paffinity_base_module->paff_module_init(); } - cleanup: - return exit_status; + return ret; } diff --git a/opal/mca/paffinity/base/paffinity_base_service.c b/opal/mca/paffinity/base/paffinity_base_service.c index 93a17fc2f0..90429ff7ee 100644 --- a/opal/mca/paffinity/base/paffinity_base_service.c +++ b/opal/mca/paffinity/base/paffinity_base_service.c @@ -48,7 +48,7 @@ static int opal_paffinity_base_socket_to_cpu_set(char **socket_list, int socket_ /* get the number of LOGICAL processors on this node */ if (OPAL_SUCCESS != (rc = opal_paffinity_base_get_processor_info(&num_processors))) { - return OPAL_ERROR; + return rc; } OPAL_PAFFINITY_CPU_ZERO(*cpumask); for (i=0; i (phys_processor = opal_paffinity_base_get_physical_processor_id(processor_id))) { - opal_output(0, "Rank %ld: PAFFINITY cannot get physical processor id for logical processor %ld", - rank, (long)processor_id); - return OPAL_ERROR; + return phys_processor; } OPAL_PAFFINITY_CPU_SET(phys_processor, *cpumask); /* output diagnostic if requested */ @@ -82,9 +80,7 @@ static int opal_paffinity_base_socket_to_cpu_set(char **socket_list, int socket_ if (logical_map) { /* need to convert this to physical processor id */ if (0 > (phys_processor = opal_paffinity_base_get_physical_processor_id(processor_id))) { - opal_output(0, "Rank %ld: PAFFINITY cannot get physical processor id for logical processor %ld", - rank, (long)processor_id); - return OPAL_ERROR; + return phys_processor; } } else { phys_processor = processor_id; @@ -107,18 +103,14 @@ static int opal_paffinity_base_socket_to_cpu_set(char **socket_list, int socket_ upper_range = atoi(range[1]); if (num_processors < (upper_range - lower_range) || 
upper_range <= lower_range) { - opal_output(0,"Rank %ld: PAFFINITY Error !!! Check your boundaries lower %d upper %d #processors %d", - rank, lower_range, upper_range, num_processors); - return OPAL_ERROR; + return OPAL_ERR_SLOT_LIST_RANGE; } for (processor_id=lower_range; processor_id<=upper_range; processor_id++) { if (logical_map) { /* need to convert this to physical processor id */ if (0 > (phys_processor = opal_paffinity_base_get_physical_processor_id(processor_id))) { - opal_output(0, "Rank %ld: PAFFINITY cannot get physical processor id for logical processor %d", - rank, processor_id); - return OPAL_ERROR; + return phys_processor; } } else { phys_processor = processor_id; @@ -174,9 +166,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i /* need to convert provided socket to a PHYSICAL socket id */ phys_socket = opal_paffinity_base_get_physical_socket_id(socket); if (0 > phys_socket) { - opal_output(0, "Rank %ld: PAFFINITY cannot get physical socket id for logical socket %ld", - rank, (long)socket); - return OPAL_ERROR; + return phys_socket; } } else { phys_socket = socket; @@ -185,8 +175,6 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i /* get the LOGICAL core info for this socket */ if ( OPAL_SUCCESS != ( rc = opal_paffinity_base_get_core_info(phys_socket, &num_cores))) { - opal_output(0,"Rank %ld: PAFFINITY Error !!! 
Could not get core info for physical socket number %d (%d)", - rank, phys_socket, socket); return rc; } @@ -195,9 +183,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i for (core = 0; core < num_cores; core++) { /* convert to PHYSICAL core id */ if (0 > (phys_core = opal_paffinity_base_get_physical_core_id(phys_socket, core))) { - opal_output(0, "Rank %ld: PAFFINITY cannot get physical core id for logical core %ld in physical socket %ld (%ld)", - rank, (long)core, (long)phys_socket, (long)socket); - return OPAL_ERROR; + return phys_core; } /* get the PHYSICAL processor id for the PHYSICAL socket/core */ if ( OPAL_SUCCESS != (rc = opal_paffinity_base_get_map_to_processor_id (phys_socket, phys_core, &phys_processor))) { @@ -224,9 +210,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i if (logical_map) { /* convert to physical core */ if (0 > (phys_core = opal_paffinity_base_get_physical_core_id(phys_socket, core))) { - opal_output(0, "Rank %ld: PAFFINITY cannot get physical core id for logical core %ld in physical socket %ld (%ld)", - rank, (long)core, (long)phys_socket, (long)socket); - return OPAL_ERROR; + return phys_core; } } else { phys_core = core; @@ -252,9 +236,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i lower_range = atoi(range[0]); upper_range = atoi(range[1]); if ( 0 > lower_range || num_cores < (upper_range - lower_range) || lower_range >= upper_range ) { - opal_output(0,"Rank %ld: PAFFINITY Error !!! 
Check your boundaries lower %d upper %d num_cores %d", - rank, lower_range, upper_range, num_cores); - return OPAL_ERROR; + return OPAL_ERR_SLOT_LIST_RANGE; } for (core=lower_range; core<=upper_range; core++) { if (logical_map) { @@ -262,7 +244,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i if (0 > (phys_core = opal_paffinity_base_get_physical_core_id(phys_socket, core))) { opal_output(0, "Rank %ld: PAFFINITY cannot get physical core id for logical core %ld in physical socket %ld (%ld)", rank, (long)core, (long)phys_socket, (long)socket); - return OPAL_ERROR; + return phys_core; } } else { phys_core = core; @@ -310,9 +292,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i if (logical_map) { /* convert to physical core */ if (0 > (phys_core = opal_paffinity_base_get_physical_core_id(phys_socket, core))) { - opal_output(0, "Rank %ld: PAFFINITY cannot get physical core id for logical core %ld in physical socket %ld (%ld)", - rank, (long)core, (long)phys_socket, (long)socket); - return OPAL_ERROR; + return phys_core; } } else { phys_core = core; @@ -338,17 +318,13 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i lower_range = atoi(range[0]); upper_range = atoi(range[1]); if ( 0 > lower_range || num_cores < (upper_range - lower_range) || lower_range >= upper_range ) { - opal_output(0,"Rank %ld: PAFFINITY Error !!! 
Check your boundaries lower %d upper %d num_cores %d", - rank, lower_range, upper_range, num_cores); - return OPAL_ERROR; + return OPAL_ERR_SLOT_LIST_RANGE; } for (core=lower_range; core<=upper_range; core++) { if (logical_map) { /* convert to physical core */ if (0 > (phys_core = opal_paffinity_base_get_physical_core_id(phys_socket, core))) { - opal_output(0, "Rank %ld: PAFFINITY cannot get physical core id for logical core %ld in physical socket %ld (%ld)", - rank, (long)core, (long)phys_socket, (long)socket); - return OPAL_ERROR; + return phys_core; } } else { phys_core = core; @@ -385,9 +361,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i /* need to convert provided socket to a PHYSICAL socket id */ phys_socket = opal_paffinity_base_get_physical_socket_id(socket); if (0 > phys_socket) { - opal_output(0, "Rank %ld: PAFFINITY cannot get physical socket id for logical socket %ld", - rank, (long)socket); - return OPAL_ERROR; + return phys_socket; } } else { phys_socket = socket; @@ -395,8 +369,6 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i /* get the LOGICAL core info for this socket */ if ( OPAL_SUCCESS != ( rc = opal_paffinity_base_get_core_info(phys_socket, &num_cores))) { - opal_output(0,"Rank %ld: PAFFINITY Error !!! 
Could not get core info for physical socket number %d (%d)", - rank, phys_socket, socket); return rc; } @@ -405,9 +377,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i for (core = 0; core < num_cores; core++) { /* convert to PHYSICAL core id */ if (0 > (phys_core = opal_paffinity_base_get_physical_core_id(phys_socket, core))) { - opal_output(0, "Rank %ld: PAFFINITY cannot get physical core id for logical core %ld in physical socket %ld (%ld)", - rank, (long)core, (long)phys_socket, (long)socket); - return OPAL_ERROR; + return phys_core; } /* get the PHYSICAL processor id for the PHYSICAL socket/core */ if ( OPAL_SUCCESS != (rc = opal_paffinity_base_get_map_to_processor_id (phys_socket, phys_core, &phys_processor))) { @@ -435,9 +405,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i if (logical_map) { /* convert to physical core */ if (0 > (phys_core = opal_paffinity_base_get_physical_core_id(phys_socket, core))) { - opal_output(0, "Rank %ld: PAFFINITY cannot get physical core id for logical core %ld in physical socket %ld (%ld)", - rank, (long)core, (long)phys_socket, (long)socket); - return OPAL_ERROR; + return phys_core; } } else { phys_core = core; @@ -463,17 +431,13 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i lower_range = atoi(range[0]); upper_range = atoi(range[1]); if ( 0 > lower_range || num_cores < (upper_range - lower_range) || lower_range >= upper_range ) { - opal_output(0,"Rank %ld: PAFFINITY Error !!! 
Check your boundaries lower %d upper %d num_cores %d", - rank, lower_range, upper_range, num_cores); - return OPAL_ERROR; + return OPAL_ERR_SLOT_LIST_RANGE; } for (core=lower_range; core<=upper_range; core++) { if (logical_map) { /* convert to physical core */ if (0 > (phys_core = opal_paffinity_base_get_physical_core_id(phys_socket, core))) { - opal_output(0, "Rank %ld: PAFFINITY cannot get physical core id for logical core %ld in physical socket %ld (%ld)", - rank, (long)core, (long)phys_socket, (long)socket); - return OPAL_ERROR; + return phys_core; } } else { phys_core = core; diff --git a/opal/mca/paffinity/base/paffinity_base_wrappers.c b/opal/mca/paffinity/base/paffinity_base_wrappers.c index cc23066bc8..c89e588abe 100644 --- a/opal/mca/paffinity/base/paffinity_base_wrappers.c +++ b/opal/mca/paffinity/base/paffinity_base_wrappers.c @@ -36,7 +36,7 @@ int opal_paffinity_base_set(opal_paffinity_base_cpu_set_t cpumask) int rc; if (!opal_paffinity_base_selected) { - return OPAL_ERR_NOT_FOUND; + return OPAL_ERR_MODULE_NOT_FOUND; } if (OPAL_SUCCESS == (rc = opal_paffinity_base_module->paff_module_set(cpumask))) { opal_paffinity_base_bound = true; @@ -54,7 +54,7 @@ int opal_paffinity_base_get(opal_paffinity_base_cpu_set_t *cpumask) } if (!opal_paffinity_base_selected) { - return OPAL_ERR_NOT_FOUND; + return OPAL_ERR_MODULE_NOT_FOUND; } if(NULL == cpumask) { return OPAL_ERR_BAD_PARAM; @@ -65,7 +65,7 @@ int opal_paffinity_base_get(opal_paffinity_base_cpu_set_t *cpumask) int opal_paffinity_base_get_map_to_processor_id(int socket, int core, int *processor_id) { if (!opal_paffinity_base_selected) { - return OPAL_ERR_NOT_FOUND; + return OPAL_ERR_MODULE_NOT_FOUND; } return opal_paffinity_base_module->paff_get_map_to_processor_id(socket, core, processor_id); } @@ -73,7 +73,7 @@ int opal_paffinity_base_get_map_to_processor_id(int socket, int core, int *proce int opal_paffinity_base_get_map_to_socket_core(int processor_id, int *socket, int *core) { if 
(!opal_paffinity_base_selected) { - return OPAL_ERR_NOT_FOUND; + return OPAL_ERR_MODULE_NOT_FOUND; } return opal_paffinity_base_module->paff_get_map_to_socket_core(processor_id, socket, core); } @@ -82,7 +82,7 @@ int opal_paffinity_base_get_map_to_socket_core(int processor_id, int *socket, in int opal_paffinity_base_get_processor_info(int *num_processors) { if (!opal_paffinity_base_selected) { - return OPAL_ERR_NOT_FOUND; + return OPAL_ERR_MODULE_NOT_FOUND; } return opal_paffinity_base_module->paff_get_processor_info(num_processors); } @@ -90,7 +90,7 @@ int opal_paffinity_base_get_processor_info(int *num_processors) int opal_paffinity_base_get_socket_info(int *num_sockets) { if (!opal_paffinity_base_selected) { - return OPAL_ERR_NOT_FOUND; + return OPAL_ERR_MODULE_NOT_FOUND; } return opal_paffinity_base_module->paff_get_socket_info(num_sockets); } @@ -98,7 +98,7 @@ int opal_paffinity_base_get_socket_info(int *num_sockets) int opal_paffinity_base_get_core_info(int socket, int *num_cores) { if (!opal_paffinity_base_selected) { - return OPAL_ERR_NOT_FOUND; + return OPAL_ERR_MODULE_NOT_FOUND; } return opal_paffinity_base_module->paff_get_core_info(socket, num_cores); } @@ -106,7 +106,7 @@ int opal_paffinity_base_get_core_info(int socket, int *num_cores) int opal_paffinity_base_get_physical_processor_id(int logical_processor_id) { if (!opal_paffinity_base_selected) { - return OPAL_ERR_NOT_FOUND; + return OPAL_ERR_MODULE_NOT_FOUND; } return opal_paffinity_base_module->paff_get_physical_processor_id(logical_processor_id); } @@ -114,7 +114,7 @@ int opal_paffinity_base_get_physical_processor_id(int logical_processor_id) int opal_paffinity_base_get_physical_socket_id(int logical_socket_id) { if (!opal_paffinity_base_selected) { - return OPAL_ERR_NOT_FOUND; + return OPAL_ERR_MODULE_NOT_FOUND; } return opal_paffinity_base_module->paff_get_physical_socket_id(logical_socket_id); } @@ -122,7 +122,7 @@ int opal_paffinity_base_get_physical_socket_id(int logical_socket_id) int 
opal_paffinity_base_get_physical_core_id(int physical_socket_id, int logical_core_id) { if (!opal_paffinity_base_selected) { - return OPAL_ERR_NOT_FOUND; + return OPAL_ERR_MODULE_NOT_FOUND; } return opal_paffinity_base_module->paff_get_physical_core_id(physical_socket_id, logical_core_id); } diff --git a/opal/runtime/opal_init.c b/opal/runtime/opal_init.c index 8ef9c2ca69..dd88df613b 100644 --- a/opal/runtime/opal_init.c +++ b/opal/runtime/opal_init.c @@ -168,11 +168,39 @@ opal_err2str(int errnum) case OPAL_ERR_DATA_OVERWRITE_ATTEMPT: retval = "Attempt to overwrite a data value"; break; + case OPAL_ERR_MODULE_NOT_FOUND: + retval = "Framework requires at least one active module, but none found"; + break; + case OPAL_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED: + retval = "OS topology does not support slot_list process affinity"; + break; + case OPAL_ERR_TOPO_SOCKET_NOT_SUPPORTED: + retval = "Could not obtain socket topology information"; + break; + case OPAL_ERR_TOPO_CORE_NOT_SUPPORTED: + retval = "Could not obtain core topology information"; + break; + case OPAL_ERR_NOT_ENOUGH_SOCKETS: + retval = "Not enough sockets to meet request"; + break; + case OPAL_ERR_NOT_ENOUGH_CORES: + retval = "Not enough cores to meet request"; + break; + case OPAL_ERR_INVALID_PHYS_CPU: + retval = "Invalid physical cpu number returned"; + break; + case OPAL_ERR_MULTIPLE_AFFINITIES: + retval = "Multiple methods for assigning process affinity were specified"; + break; + case OPAL_ERR_SLOT_LIST_RANGE: + retval = "Provided slot_list range is invalid"; + break; + default: retval = NULL; - } +} - return retval; +return retval; } diff --git a/orte/include/orte/constants.h b/orte/include/orte/constants.h index 77fbc4ce37..ed1cfaeda1 100644 --- a/orte/include/orte/constants.h +++ b/orte/include/orte/constants.h @@ -31,40 +31,50 @@ enum { /* Error codes inherited from OPAL. Still enum values so that we get the nice debugger help. 
*/ - ORTE_SUCCESS = OPAL_SUCCESS, + ORTE_SUCCESS = OPAL_SUCCESS, - ORTE_ERROR = OPAL_ERROR, - ORTE_ERR_OUT_OF_RESOURCE = OPAL_ERR_OUT_OF_RESOURCE, - ORTE_ERR_TEMP_OUT_OF_RESOURCE = OPAL_ERR_TEMP_OUT_OF_RESOURCE, - ORTE_ERR_RESOURCE_BUSY = OPAL_ERR_RESOURCE_BUSY, - ORTE_ERR_BAD_PARAM = OPAL_ERR_BAD_PARAM, - ORTE_ERR_FATAL = OPAL_ERR_FATAL, - ORTE_ERR_NOT_IMPLEMENTED = OPAL_ERR_NOT_IMPLEMENTED, - ORTE_ERR_NOT_SUPPORTED = OPAL_ERR_NOT_SUPPORTED, - ORTE_ERR_INTERUPTED = OPAL_ERR_INTERUPTED, - ORTE_ERR_WOULD_BLOCK = OPAL_ERR_WOULD_BLOCK, - ORTE_ERR_IN_ERRNO = OPAL_ERR_IN_ERRNO, - ORTE_ERR_UNREACH = OPAL_ERR_UNREACH, - ORTE_ERR_NOT_FOUND = OPAL_ERR_NOT_FOUND, - ORTE_EXISTS = OPAL_EXISTS, - ORTE_ERR_TIMEOUT = OPAL_ERR_TIMEOUT, - ORTE_ERR_NOT_AVAILABLE = OPAL_ERR_NOT_AVAILABLE, - ORTE_ERR_PERM = OPAL_ERR_PERM, - ORTE_ERR_VALUE_OUT_OF_BOUNDS = OPAL_ERR_VALUE_OUT_OF_BOUNDS, - ORTE_ERR_FILE_READ_FAILURE = OPAL_ERR_FILE_READ_FAILURE, - ORTE_ERR_FILE_WRITE_FAILURE = OPAL_ERR_FILE_WRITE_FAILURE, - ORTE_ERR_FILE_OPEN_FAILURE = OPAL_ERR_FILE_OPEN_FAILURE, - ORTE_ERR_PACK_MISMATCH = OPAL_ERR_PACK_MISMATCH, - ORTE_ERR_PACK_FAILURE = OPAL_ERR_PACK_FAILURE, - ORTE_ERR_UNPACK_FAILURE = OPAL_ERR_UNPACK_FAILURE, - ORTE_ERR_UNPACK_INADEQUATE_SPACE = OPAL_ERR_UNPACK_INADEQUATE_SPACE, + ORTE_ERROR = OPAL_ERROR, + ORTE_ERR_OUT_OF_RESOURCE = OPAL_ERR_OUT_OF_RESOURCE, + ORTE_ERR_TEMP_OUT_OF_RESOURCE = OPAL_ERR_TEMP_OUT_OF_RESOURCE, + ORTE_ERR_RESOURCE_BUSY = OPAL_ERR_RESOURCE_BUSY, + ORTE_ERR_BAD_PARAM = OPAL_ERR_BAD_PARAM, + ORTE_ERR_FATAL = OPAL_ERR_FATAL, + ORTE_ERR_NOT_IMPLEMENTED = OPAL_ERR_NOT_IMPLEMENTED, + ORTE_ERR_NOT_SUPPORTED = OPAL_ERR_NOT_SUPPORTED, + ORTE_ERR_INTERUPTED = OPAL_ERR_INTERUPTED, + ORTE_ERR_WOULD_BLOCK = OPAL_ERR_WOULD_BLOCK, + ORTE_ERR_IN_ERRNO = OPAL_ERR_IN_ERRNO, + ORTE_ERR_UNREACH = OPAL_ERR_UNREACH, + ORTE_ERR_NOT_FOUND = OPAL_ERR_NOT_FOUND, + ORTE_EXISTS = OPAL_EXISTS, + ORTE_ERR_TIMEOUT = OPAL_ERR_TIMEOUT, + ORTE_ERR_NOT_AVAILABLE = OPAL_ERR_NOT_AVAILABLE, + 
ORTE_ERR_PERM = OPAL_ERR_PERM, + ORTE_ERR_VALUE_OUT_OF_BOUNDS = OPAL_ERR_VALUE_OUT_OF_BOUNDS, + ORTE_ERR_FILE_READ_FAILURE = OPAL_ERR_FILE_READ_FAILURE, + ORTE_ERR_FILE_WRITE_FAILURE = OPAL_ERR_FILE_WRITE_FAILURE, + ORTE_ERR_FILE_OPEN_FAILURE = OPAL_ERR_FILE_OPEN_FAILURE, + ORTE_ERR_PACK_MISMATCH = OPAL_ERR_PACK_MISMATCH, + ORTE_ERR_PACK_FAILURE = OPAL_ERR_PACK_FAILURE, + ORTE_ERR_UNPACK_FAILURE = OPAL_ERR_UNPACK_FAILURE, + ORTE_ERR_UNPACK_INADEQUATE_SPACE = OPAL_ERR_UNPACK_INADEQUATE_SPACE, ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER = OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER, - ORTE_ERR_TYPE_MISMATCH = OPAL_ERR_TYPE_MISMATCH, - ORTE_ERR_OPERATION_UNSUPPORTED = OPAL_ERR_OPERATION_UNSUPPORTED, - ORTE_ERR_UNKNOWN_DATA_TYPE = OPAL_ERR_UNKNOWN_DATA_TYPE, - ORTE_ERR_BUFFER = OPAL_ERR_BUFFER, - ORTE_ERR_DATA_TYPE_REDEF = OPAL_ERR_DATA_TYPE_REDEF, - ORTE_ERR_DATA_OVERWRITE_ATTEMPT = OPAL_ERR_DATA_OVERWRITE_ATTEMPT, + ORTE_ERR_TYPE_MISMATCH = OPAL_ERR_TYPE_MISMATCH, + ORTE_ERR_OPERATION_UNSUPPORTED = OPAL_ERR_OPERATION_UNSUPPORTED, + ORTE_ERR_UNKNOWN_DATA_TYPE = OPAL_ERR_UNKNOWN_DATA_TYPE, + ORTE_ERR_BUFFER = OPAL_ERR_BUFFER, + ORTE_ERR_DATA_TYPE_REDEF = OPAL_ERR_DATA_TYPE_REDEF, + ORTE_ERR_DATA_OVERWRITE_ATTEMPT = OPAL_ERR_DATA_OVERWRITE_ATTEMPT, + ORTE_ERR_MODULE_NOT_FOUND = OPAL_ERR_MODULE_NOT_FOUND, + ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED = OPAL_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED, + ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED = OPAL_ERR_TOPO_SOCKET_NOT_SUPPORTED, + ORTE_ERR_TOPO_CORE_NOT_SUPPORTED = OPAL_ERR_TOPO_CORE_NOT_SUPPORTED, + ORTE_ERR_NOT_ENOUGH_SOCKETS = OPAL_ERR_NOT_ENOUGH_SOCKETS, + ORTE_ERR_NOT_ENOUGH_CORES = OPAL_ERR_NOT_ENOUGH_CORES, + ORTE_ERR_INVALID_PHYS_CPU = OPAL_ERR_INVALID_PHYS_CPU, + ORTE_ERR_MULTIPLE_AFFINITIES = OPAL_ERR_MULTIPLE_AFFINITIES, + ORTE_ERR_SLOT_LIST_RANGE = OPAL_ERR_SLOT_LIST_RANGE, + /* error codes specific to ORTE - don't forget to update orte/util/error_strings.c when adding new error codes!! 
Otherwise, the error reporting system will potentially crash, @@ -101,8 +111,9 @@ enum { ORTE_ERR_SYS_LIMITS_SOCKETS = (ORTE_ERR_BASE - 29), ORTE_ERR_SOCKET_NOT_AVAILABLE = (ORTE_ERR_BASE - 30), ORTE_ERR_SYSTEM_WILL_BOOTSTRAP = (ORTE_ERR_BASE - 31), - ORTE_ERR_MODULE_NOT_FOUND = (ORTE_ERR_BASE - 32), - ORTE_ERR_RELOCATE_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 33) + ORTE_ERR_RELOCATE_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 32), + ORTE_ERR_INVALID_NODE_RANK = (ORTE_ERR_BASE - 33), + ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 34) }; #define ORTE_ERR_MAX (ORTE_ERR_BASE - 100) diff --git a/orte/mca/odls/default/help-odls-default.txt b/orte/mca/odls/default/help-odls-default.txt index 500724229f..aed60f6375 100644 --- a/orte/mca/odls/default/help-odls-default.txt +++ b/orte/mca/odls/default/help-odls-default.txt @@ -46,16 +46,6 @@ This could mean that your PATH or executable name is wrong, or that you do not have the necessary permissions. Please ensure that the executable is able to be found and executed. -# -[nodeid-out-of-range] -The id of a node is out of the allowed range. - -Value given: %ld -Max value allowed: %ld - -This may be resolved by increasing the number of available node id's by -re-configuring Open MPI with the --enable-jumbo-clusters option, and then -re-running the application # [odls-default:multiple-paffinity-schemes] Multiple processor affinity schemes were specified (can only specify one): @@ -130,3 +120,27 @@ binding action: Application name: %s Please revise the request and try again. +# +[odls-default:paffinity-missing-module] +A request to bind processes was made, but no paffinity module +was found: + + Local host: %s + +This is potentially a configuration issue. You can rerun your job without +requesting binding, or check the configuration. +# +[odls-default:invalid-slot-list-range] +A slot list was provided that exceeds the boundaries on available +resources: + + Local host: %s + Slot list: %s + +Please check your boundaries and try again. 
+# +[odls-default:affinity-not-supported] +A request was made to bind processes, but process affinity is +not supported on this node: + + Local host: %s diff --git a/orte/mca/odls/default/odls_default_module.c b/orte/mca/odls/default/odls_default_module.c index 431fb9a54e..7c5b134397 100644 --- a/orte/mca/odls/default/odls_default_module.c +++ b/orte/mca/odls/default/odls_default_module.c @@ -294,7 +294,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, child->pid = pid; } - if(pid < 0) { + if (pid < 0) { ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN); if (NULL != child) { child->state = ORTE_PROC_STATE_FAILED_TO_START; @@ -344,10 +344,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), child->slot_list, ORTE_NAME_PRINT(child->name))); if (opal_paffinity_alone) { - /* It's an error if multiple paffinity schemes were specified */ - orte_show_help("help-odls-default.txt", - "odls-default:multiple-paffinity-schemes", true, child->slot_list); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_MULTIPLE_AFFINITIES); } if (orte_report_bindings) { opal_output(0, "%s odls:default:fork binding child %s to slot_list %s", @@ -355,17 +352,6 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, ORTE_NAME_PRINT(child->name), child->slot_list); } if (ORTE_SUCCESS != (rc = opal_paffinity_base_slot_list_set((long)child->name->vpid, child->slot_list, &mask))) { - if (ORTE_ERR_NOT_SUPPORTED == rc) { - /* OS doesn't support providing topology information */ - orte_show_help("help-odls-default.txt", - "odls-default:topo-not-supported", - true, orte_process_info.nodename, "rankfile containing a slot_list of ", - child->slot_list, context->app); - ORTE_ODLS_ERROR_OUT(rc); - } - - orte_show_help("help-odls-default.txt", - "odls-default:slot-list-failed", true, child->slot_list, ORTE_ERROR_NAME(rc)); ORTE_ODLS_ERROR_OUT(rc); } /* if we didn't wind up bound, then generate a 
warning unless suppressed */ @@ -381,15 +367,11 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, (int)jobdat->cpus_per_rank, (int)jobdat->stride)); /* get the node rank */ if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(child->name))) { - orte_show_help("help-odls-default.txt", - "odls-default:invalid-node-rank", true); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_INVALID_NODE_RANK); } /* get the local rank */ if (ORTE_LOCAL_RANK_INVALID == (lrank = orte_ess.get_local_rank(child->name))) { - orte_show_help("help-odls-default.txt", - "odls-default:invalid-local-rank", true); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_INVALID_LOCAL_RANK); } /* init the mask */ OPAL_PAFFINITY_CPU_ZERO(mask); @@ -417,22 +399,14 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, /* if we don't have enough sockets, that is an error */ if (n < logical_skt) { ORTE_ODLS_IF_BIND_NOT_REQD(5); - orte_show_help("help-odls-default.txt", - "odls-default:not-enough-resources", true, - "sockets", orte_process_info.nodename, - "bind-to-core", context->app); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_NOT_ENOUGH_SOCKETS); } } else { target_socket = opal_paffinity_base_get_physical_socket_id(logical_skt); - if (ORTE_ERR_NOT_SUPPORTED == target_socket) { + if (target_socket < 0) { /* OS doesn't support providing topology information */ ORTE_ODLS_IF_BIND_NOT_REQD(5); - orte_show_help("help-odls-default.txt", - "odls-default:topo-not-supported", - true, orte_process_info.nodename, "bind-to-core", "", - context->app); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(target_socket); } } OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, @@ -453,14 +427,10 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, * from when we initialized */ target_socket = opal_paffinity_base_get_physical_socket_id(lrank % orte_odls_globals.num_sockets); - if 
(ORTE_ERR_NOT_SUPPORTED == target_socket) { + if (target_socket < 0) { /* OS does not support providing topology information */ ORTE_ODLS_IF_BIND_NOT_REQD(5); - orte_show_help("help-odls-default.txt", - "odls-default:topo-not-supported", - true, orte_process_info.nodename, "bind-to-core", "", - context->app); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(target_socket); } OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, "bysocket lrank %d numsocks %d logical socket %d target socket %d", (int)lrank, @@ -477,18 +447,12 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, phys_core = opal_paffinity_base_get_physical_core_id(target_socket, logical_cpu); if (0 > phys_core) { ORTE_ODLS_IF_BIND_NOT_REQD(5); - orte_show_help("help-odls-default.txt", - "odls-default:invalid-phys-cpu", true); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(phys_core); } /* map this to a physical cpu on this node */ - if (ORTE_SUCCESS != opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, &phys_cpu)) { + if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, &phys_cpu))) { ORTE_ODLS_IF_BIND_NOT_REQD(5); - orte_show_help("help-odls-default.txt", - "odls-default:not-enough-resources", true, - "processors", orte_process_info.nodename, - "bind-to-core", context->app); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(rc); } /* are we bound? 
*/ if (orte_odls_globals.bound) { @@ -532,11 +496,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, /* if we don't have enough processors, that is an error */ if (ncpu <= logical_cpu) { ORTE_ODLS_IF_BIND_NOT_REQD(5); - orte_show_help("help-odls-default.txt", - "odls-default:not-enough-resources", true, - "processors", orte_process_info.nodename, - "bind-to-core", context->app); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_NOT_ENOUGH_CORES); } } else { /* if we are not bound, then all processors are available @@ -544,19 +504,9 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, * physical cpu */ phys_cpu = opal_paffinity_base_get_physical_processor_id(logical_cpu); - if (OPAL_ERROR == phys_cpu){ - /* No processor to bind to so error out */ + if (0 > phys_cpu) { ORTE_ODLS_IF_BIND_NOT_REQD(5); - orte_show_help("help-odls-default.txt", - "odls-default:not-enough-resources", true, - "processors", orte_process_info.nodename, - "bind-to-core", context->app); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); - } else if (0 > phys_cpu) { - ORTE_ODLS_IF_BIND_NOT_REQD(5); - orte_show_help("help-odls-default.txt", - "odls-default:invalid-phys-cpu", true); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(phys_cpu); } } OPAL_PAFFINITY_CPU_SET(phys_cpu, mask); @@ -573,8 +523,6 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, } if (ORTE_SUCCESS != (rc = opal_paffinity_base_set(mask))) { ORTE_ODLS_IF_BIND_NOT_REQD(5); - orte_show_help("help-odls-default.txt", - "odls-default:failed-set-paff", true); ORTE_ODLS_ERROR_OUT(rc); } paffinity_enabled = true; @@ -590,9 +538,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, * the provided mapping policy */ if (ORTE_LOCAL_RANK_INVALID == (lrank = orte_ess.get_local_rank(child->name))) { - orte_show_help("help-odls-default.txt", - "odls-default:invalid-local-rank", true); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + 
ORTE_ODLS_ERROR_OUT(ORTE_ERR_INVALID_LOCAL_RANK); } if (ORTE_MAPPING_NPERXXX & jobdat->policy) { /* we need to balance the children from this job across the available sockets */ @@ -618,22 +564,14 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, /* if we don't have enough sockets, that is an error */ if (n < logical_skt) { ORTE_ODLS_IF_BIND_NOT_REQD(6); - orte_show_help("help-odls-default.txt", - "odls-default:not-enough-resources", true, - "sockets", orte_process_info.nodename, - "bind-to-socket", context->app); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_NOT_ENOUGH_SOCKETS); } } else { target_socket = opal_paffinity_base_get_physical_socket_id(logical_skt); - if (ORTE_ERR_NOT_SUPPORTED == target_socket) { + if (target_socket < 0) { /* OS doesn't support providing topology information */ ORTE_ODLS_IF_BIND_NOT_REQD(6); - orte_show_help("help-odls-default.txt", - "odls-default:topo-not-supported", - true, orte_process_info.nodename, "bind-to-socket", "", - context->app); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(target_socket); } } OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, @@ -650,14 +588,10 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, * from when we initialized */ target_socket = opal_paffinity_base_get_physical_socket_id(lrank % orte_odls_globals.num_sockets); - if (ORTE_ERR_NOT_SUPPORTED == target_socket) { + if (target_socket < 0) { /* OS does not support providing topology information */ ORTE_ODLS_IF_BIND_NOT_REQD(6); - orte_show_help("help-odls-default.txt", - "odls-default:topo-not-supported", - true, orte_process_info.nodename, "bind-to-socket", "", - context->app); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(target_socket); } OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, "bysocket lrank %d numsocks %d logical socket %d target socket %d", (int)lrank, @@ -687,36 +621,24 @@ static int odls_default_fork_local_proc(orte_app_context_t* 
context, /* if we don't have enough processors, that is an error */ if (ncpu < logical_cpu) { ORTE_ODLS_IF_BIND_NOT_REQD(6); - orte_show_help("help-odls-default.txt", - "odls-default:not-enough-resources", true, - "processors", orte_process_info.nodename, - "bind-to-socket", context->app); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_NOT_ENOUGH_SOCKETS); } /* get the physical socket of that cpu */ - if (ORTE_SUCCESS != opal_paffinity_base_get_map_to_socket_core(phys_cpu, &target_socket, &phys_core)) { + if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_map_to_socket_core(phys_cpu, &target_socket, &phys_core))) { if (ORTE_BINDING_NOT_REQUIRED(jobdat->policy)) { goto LAUNCH_PROCS; } - orte_show_help("help-odls-default.txt", - "odls-default:topo-not-supported", - true, orte_process_info.nodename, "bind-to-socket", "", - context->app); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(rc); } } else { /* if we are not bound, then just use all sockets */ if (1 == orte_odls_globals.num_sockets) { /* if we only have one socket, then just put it there */ target_socket = opal_paffinity_base_get_physical_socket_id(0); - if (ORTE_ERR_NOT_SUPPORTED == target_socket) { + if (target_socket < 0) { /* OS doesn't support providing topology information */ ORTE_ODLS_IF_BIND_NOT_REQD(6); - orte_show_help("help-odls-default.txt", - "odls-default:topo-not-supported", - true, orte_process_info.nodename, "bind-to-socket", "", - context->app); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(target_socket); } } else { /* compute the logical socket, compensating for the number of cpus_per_rank */ @@ -725,14 +647,10 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, logical_skt = logical_skt % orte_odls_globals.num_sockets; /* now get the target physical socket */ target_socket = opal_paffinity_base_get_physical_socket_id(logical_skt); - if (ORTE_ERR_NOT_SUPPORTED == target_socket) { + if (target_socket < 0) { /* OS doesn't 
support providing topology information */ ORTE_ODLS_IF_BIND_NOT_REQD(6); - orte_show_help("help-odls-default.txt", - "odls-default:topo-not-supported", - true, orte_process_info.nodename, "bind-to-socket", "", - context->app); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(target_socket); } } OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, @@ -745,18 +663,14 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, for (n=0; n < orte_default_num_cores_per_socket; n++) { /* get the physical core within this target socket */ phys_core = opal_paffinity_base_get_physical_core_id(target_socket, n); - if (0 > phys_core) { + if (phys_core < 0) { ORTE_ODLS_IF_BIND_NOT_REQD(6); - orte_show_help("help-odls-default.txt", - "odls-default:invalid-phys-cpu", true); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(phys_core); } /* map this to a physical cpu on this node */ - if (ORTE_SUCCESS != opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, &phys_cpu)) { + if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, &phys_cpu))) { ORTE_ODLS_IF_BIND_NOT_REQD(6); - orte_show_help("help-odls-default.txt", - "odls-default:invalid-phys-cpu", true); - ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + ORTE_ODLS_ERROR_OUT(rc); } /* are we bound? */ if (orte_odls_globals.bound) { @@ -783,8 +697,6 @@ static int odls_default_fork_local_proc(orte_app_context_t* context, } if (ORTE_SUCCESS != (rc = opal_paffinity_base_set(mask))) { ORTE_ODLS_IF_BIND_NOT_REQD(6); - orte_show_help("help-odls-default.txt", - "odls-default:failed-set-paff", true); ORTE_ODLS_ERROR_OUT(rc); } paffinity_enabled = true; @@ -932,12 +844,64 @@ LAUNCH_PROCS: know about the failure. The actual exit status of child proc cannot be found here - all we can do is report the ORTE error code that was reported back to us. 
The calling func needs to report the - failure to launch this process through the SMR or else + failure to launch this process through the errmgr or else everyone else will hang. */ if (NULL != child) { child->state = ORTE_PROC_STATE_FAILED_TO_START; child->exit_code = i; + if (ORTE_ERR_MULTIPLE_AFFINITIES == i) { + /* It's an error if multiple paffinity schemes were specified */ + orte_show_help("help-odls-default.txt", + "odls-default:multiple-paffinity-schemes", true, child->slot_list); + } else if (ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED == i) { + /* OS doesn't support providing topology information */ + orte_show_help("help-odls-default.txt", + "odls-default:topo-not-supported", + true, orte_process_info.nodename, "rankfile containing a slot_list of ", + child->slot_list, context->app); + } else if (ORTE_ERR_INVALID_NODE_RANK == i) { + orte_show_help("help-odls-default.txt", + "odls-default:invalid-node-rank", true); + } else if (ORTE_ERR_INVALID_LOCAL_RANK == i) { + orte_show_help("help-odls-default.txt", + "odls-default:invalid-local-rank", true); + } else if (ORTE_ERR_NOT_ENOUGH_CORES == i) { + orte_show_help("help-odls-default.txt", + "odls-default:not-enough-resources", true, + "sockets", orte_process_info.nodename, + "bind-to-core", context->app); + } else if (ORTE_ERR_TOPO_CORE_NOT_SUPPORTED == i) { + orte_show_help("help-odls-default.txt", + "odls-default:topo-not-supported", + true, orte_process_info.nodename, "bind-to-core", "", + context->app); + } else if (ORTE_ERR_INVALID_PHYS_CPU == i) { + orte_show_help("help-odls-default.txt", + "odls-default:invalid-phys-cpu", true); + } else if (ORTE_ERR_NOT_ENOUGH_SOCKETS == i) { + orte_show_help("help-odls-default.txt", + "odls-default:not-enough-resources", true, + "sockets", orte_process_info.nodename, + "bind-to-socket", context->app); + } else if (ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED == i) { + orte_show_help("help-odls-default.txt", + "odls-default:topo-not-supported", + true, orte_process_info.nodename, 
"bind-to-socket", "", + context->app); + } else if (ORTE_ERR_MODULE_NOT_FOUND == i) { + orte_show_help("help-odls-default.txt", + "odls-default:paffinity-missing-module", + true, orte_process_info.nodename); + } else if (ORTE_ERR_SLOT_LIST_RANGE == i) { + orte_show_help("help-odls-default.txt", + "odls-default:invalid-slot-list-range", + true, orte_process_info.nodename, child->slot_list); + } else if (ORTE_ERR_NOT_SUPPORTED == i) { + orte_show_help("help-odls-default.txt", + "odls-default:affinity-not-supported", + true, orte_process_info.nodename); + } } OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, diff --git a/orte/util/error_strings.c b/orte/util/error_strings.c index 1572a21acb..f5d7ed3c3e 100644 --- a/orte/util/error_strings.c +++ b/orte/util/error_strings.c @@ -126,9 +126,6 @@ const char *orte_err2str(int errnum) case ORTE_ERR_SYSTEM_WILL_BOOTSTRAP: retval = "System will determine resources during bootstrap of daemons"; break; - case ORTE_ERR_MODULE_NOT_FOUND: - retval = "Framework requires at least one active module, but none found"; - break; case ORTE_ERR_RELOCATE_LIMIT_EXCEEDED: retval = "Limit on number of process relocations was exceeded"; break;