1
1

More cleanup on paffinity....groan

It is okay to not have a paffinity module IF you aren't using paffinity anyway. So don't error out of MPI_Init because a paffinity module wasn't selected.

Clean up error reporting in the odls default module to (once and for all!) eliminate messages originating in the fork'd process. Create some new error codes to allow us to pass enough info back to the parent process to provide useful error messages.

This commit was SVN r23106.
Этот коммит содержится в:
Ralph Castain 2010-05-06 20:57:17 +00:00
родитель 477201e161
Коммит d4f56cff61
9 изменённых файлов: 256 добавлений и 270 удалений

Просмотреть файл

@ -24,32 +24,32 @@
#define OPAL_ERR_BASE 0 /* internal use only */
enum {
OPAL_SUCCESS = (OPAL_ERR_BASE),
OPAL_SUCCESS = (OPAL_ERR_BASE),
OPAL_ERROR = (OPAL_ERR_BASE - 1),
OPAL_ERR_OUT_OF_RESOURCE = (OPAL_ERR_BASE - 2), /* fatal error */
OPAL_ERR_TEMP_OUT_OF_RESOURCE = (OPAL_ERR_BASE - 3), /* try again later */
OPAL_ERR_RESOURCE_BUSY = (OPAL_ERR_BASE - 4),
OPAL_ERR_BAD_PARAM = (OPAL_ERR_BASE - 5), /* equivalent to MPI_ERR_ARG error code */
OPAL_ERR_FATAL = (OPAL_ERR_BASE - 6),
OPAL_ERR_NOT_IMPLEMENTED = (OPAL_ERR_BASE - 7),
OPAL_ERR_NOT_SUPPORTED = (OPAL_ERR_BASE - 8),
OPAL_ERR_INTERUPTED = (OPAL_ERR_BASE - 9),
OPAL_ERR_WOULD_BLOCK = (OPAL_ERR_BASE - 10),
OPAL_ERR_IN_ERRNO = (OPAL_ERR_BASE - 11),
OPAL_ERR_UNREACH = (OPAL_ERR_BASE - 12),
OPAL_ERR_NOT_FOUND = (OPAL_ERR_BASE - 13),
OPAL_EXISTS = (OPAL_ERR_BASE - 14), /* indicates that the specified object already exists */
OPAL_ERR_TIMEOUT = (OPAL_ERR_BASE - 15),
OPAL_ERR_NOT_AVAILABLE = (OPAL_ERR_BASE - 16),
OPAL_ERR_PERM = (OPAL_ERR_BASE - 17), /* no permission */
OPAL_ERR_VALUE_OUT_OF_BOUNDS = (OPAL_ERR_BASE - 18),
OPAL_ERR_FILE_READ_FAILURE = (OPAL_ERR_BASE - 19),
OPAL_ERR_FILE_WRITE_FAILURE = (OPAL_ERR_BASE - 20),
OPAL_ERR_FILE_OPEN_FAILURE = (OPAL_ERR_BASE - 21),
OPAL_ERR_PACK_MISMATCH = (OPAL_ERR_BASE - 22),
OPAL_ERR_PACK_FAILURE = (OPAL_ERR_BASE - 23),
OPAL_ERR_UNPACK_FAILURE = (OPAL_ERR_BASE - 24),
OPAL_ERROR = (OPAL_ERR_BASE - 1),
OPAL_ERR_OUT_OF_RESOURCE = (OPAL_ERR_BASE - 2), /* fatal error */
OPAL_ERR_TEMP_OUT_OF_RESOURCE = (OPAL_ERR_BASE - 3), /* try again later */
OPAL_ERR_RESOURCE_BUSY = (OPAL_ERR_BASE - 4),
OPAL_ERR_BAD_PARAM = (OPAL_ERR_BASE - 5), /* equivalent to MPI_ERR_ARG error code */
OPAL_ERR_FATAL = (OPAL_ERR_BASE - 6),
OPAL_ERR_NOT_IMPLEMENTED = (OPAL_ERR_BASE - 7),
OPAL_ERR_NOT_SUPPORTED = (OPAL_ERR_BASE - 8),
OPAL_ERR_INTERUPTED = (OPAL_ERR_BASE - 9),
OPAL_ERR_WOULD_BLOCK = (OPAL_ERR_BASE - 10),
OPAL_ERR_IN_ERRNO = (OPAL_ERR_BASE - 11),
OPAL_ERR_UNREACH = (OPAL_ERR_BASE - 12),
OPAL_ERR_NOT_FOUND = (OPAL_ERR_BASE - 13),
OPAL_EXISTS = (OPAL_ERR_BASE - 14), /* indicates that the specified object already exists */
OPAL_ERR_TIMEOUT = (OPAL_ERR_BASE - 15),
OPAL_ERR_NOT_AVAILABLE = (OPAL_ERR_BASE - 16),
OPAL_ERR_PERM = (OPAL_ERR_BASE - 17), /* no permission */
OPAL_ERR_VALUE_OUT_OF_BOUNDS = (OPAL_ERR_BASE - 18),
OPAL_ERR_FILE_READ_FAILURE = (OPAL_ERR_BASE - 19),
OPAL_ERR_FILE_WRITE_FAILURE = (OPAL_ERR_BASE - 20),
OPAL_ERR_FILE_OPEN_FAILURE = (OPAL_ERR_BASE - 21),
OPAL_ERR_PACK_MISMATCH = (OPAL_ERR_BASE - 22),
OPAL_ERR_PACK_FAILURE = (OPAL_ERR_BASE - 23),
OPAL_ERR_UNPACK_FAILURE = (OPAL_ERR_BASE - 24),
OPAL_ERR_UNPACK_INADEQUATE_SPACE = (OPAL_ERR_BASE - 25),
OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER = (OPAL_ERR_BASE - 26),
OPAL_ERR_TYPE_MISMATCH = (OPAL_ERR_BASE - 27),
@ -57,7 +57,18 @@ enum {
OPAL_ERR_UNKNOWN_DATA_TYPE = (OPAL_ERR_BASE - 29),
OPAL_ERR_BUFFER = (OPAL_ERR_BASE - 30),
OPAL_ERR_DATA_TYPE_REDEF = (OPAL_ERR_BASE - 31),
OPAL_ERR_DATA_OVERWRITE_ATTEMPT = (OPAL_ERR_BASE - 32)
OPAL_ERR_DATA_OVERWRITE_ATTEMPT = (OPAL_ERR_BASE - 32),
OPAL_ERR_MODULE_NOT_FOUND = (OPAL_ERR_BASE - 33),
OPAL_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED = (OPAL_ERR_BASE - 34),
OPAL_ERR_TOPO_SOCKET_NOT_SUPPORTED = (OPAL_ERR_BASE - 35),
OPAL_ERR_TOPO_CORE_NOT_SUPPORTED = (OPAL_ERR_BASE - 36),
OPAL_ERR_NOT_ENOUGH_SOCKETS = (OPAL_ERR_BASE - 37),
OPAL_ERR_NOT_ENOUGH_CORES = (OPAL_ERR_BASE - 38),
OPAL_ERR_INVALID_PHYS_CPU = (OPAL_ERR_BASE - 39),
OPAL_ERR_MULTIPLE_AFFINITIES = (OPAL_ERR_BASE - 40),
OPAL_ERR_SLOT_LIST_RANGE = (OPAL_ERR_BASE - 41)
};
#define OPAL_ERR_MAX (OPAL_ERR_BASE - 100)

Просмотреть файл

@ -37,7 +37,7 @@ const opal_paffinity_base_module_1_1_0_t *opal_paffinity_base_module = NULL;
int opal_paffinity_base_select(void)
{
int ret, exit_status = OPAL_SUCCESS;
int ret = OPAL_SUCCESS;
opal_paffinity_base_component_2_0_0_t *best_component = NULL;
opal_paffinity_base_module_1_1_0_t *best_module = NULL;
@ -48,9 +48,10 @@ int opal_paffinity_base_select(void)
&opal_paffinity_base_components_opened,
(mca_base_module_t **) &best_module,
(mca_base_component_t **) &best_component) ) {
/* This will only happen if no component was selected */
exit_status = OPAL_ERR_NOT_FOUND;
goto cleanup;
/* It is okay if we don't find a module - we will report an
* error if/when someone tries to actually use affinity
*/
return OPAL_SUCCESS;
}
/* Save the winner */
@ -60,12 +61,8 @@ int opal_paffinity_base_select(void)
/* Initialize the winner */
if (NULL != opal_paffinity_base_module) {
if (OPAL_SUCCESS != (ret = opal_paffinity_base_module->paff_module_init()) ) {
exit_status = ret;
goto cleanup;
}
ret = opal_paffinity_base_module->paff_module_init();
}
cleanup:
return exit_status;
return ret;
}

Просмотреть файл

@ -48,7 +48,7 @@ static int opal_paffinity_base_socket_to_cpu_set(char **socket_list, int socket_
/* get the number of LOGICAL processors on this node */
if (OPAL_SUCCESS != (rc = opal_paffinity_base_get_processor_info(&num_processors))) {
return OPAL_ERROR;
return rc;
}
OPAL_PAFFINITY_CPU_ZERO(*cpumask);
for (i=0; i<socket_cnt; i++) {
@ -56,9 +56,7 @@ static int opal_paffinity_base_socket_to_cpu_set(char **socket_list, int socket_
/* bind to all available logical processors - first set the bits in the cpu mask */
for ( processor_id=0; processor_id<=num_processors; processor_id++) {
if (0 > (phys_processor = opal_paffinity_base_get_physical_processor_id(processor_id))) {
opal_output(0, "Rank %ld: PAFFINITY cannot get physical processor id for logical processor %ld",
rank, (long)processor_id);
return OPAL_ERROR;
return phys_processor;
}
OPAL_PAFFINITY_CPU_SET(phys_processor, *cpumask);
/* output diagnostic if requested */
@ -82,9 +80,7 @@ static int opal_paffinity_base_socket_to_cpu_set(char **socket_list, int socket_
if (logical_map) {
/* need to convert this to physical processor id */
if (0 > (phys_processor = opal_paffinity_base_get_physical_processor_id(processor_id))) {
opal_output(0, "Rank %ld: PAFFINITY cannot get physical processor id for logical processor %ld",
rank, (long)processor_id);
return OPAL_ERROR;
return phys_processor;
}
} else {
phys_processor = processor_id;
@ -107,18 +103,14 @@ static int opal_paffinity_base_socket_to_cpu_set(char **socket_list, int socket_
upper_range = atoi(range[1]);
if (num_processors < (upper_range - lower_range) ||
upper_range <= lower_range) {
opal_output(0,"Rank %ld: PAFFINITY Error !!! Check your boundaries lower %d upper %d #processors %d",
rank, lower_range, upper_range, num_processors);
return OPAL_ERROR;
return OPAL_ERR_SLOT_LIST_RANGE;
}
for (processor_id=lower_range; processor_id<=upper_range; processor_id++) {
if (logical_map) {
/* need to convert this to physical processor id */
if (0 > (phys_processor = opal_paffinity_base_get_physical_processor_id(processor_id))) {
opal_output(0, "Rank %ld: PAFFINITY cannot get physical processor id for logical processor %d",
rank, processor_id);
return OPAL_ERROR;
return phys_processor;
}
} else {
phys_processor = processor_id;
@ -174,9 +166,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i
/* need to convert provided socket to a PHYSICAL socket id */
phys_socket = opal_paffinity_base_get_physical_socket_id(socket);
if (0 > phys_socket) {
opal_output(0, "Rank %ld: PAFFINITY cannot get physical socket id for logical socket %ld",
rank, (long)socket);
return OPAL_ERROR;
return phys_socket;
}
} else {
phys_socket = socket;
@ -185,8 +175,6 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i
/* get the LOGICAL core info for this socket */
if ( OPAL_SUCCESS != ( rc = opal_paffinity_base_get_core_info(phys_socket, &num_cores))) {
opal_output(0,"Rank %ld: PAFFINITY Error !!! Could not get core info for physical socket number %d (%d)",
rank, phys_socket, socket);
return rc;
}
@ -195,9 +183,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i
for (core = 0; core < num_cores; core++) {
/* convert to PHYSICAL core id */
if (0 > (phys_core = opal_paffinity_base_get_physical_core_id(phys_socket, core))) {
opal_output(0, "Rank %ld: PAFFINITY cannot get physical core id for logical core %ld in physical socket %ld (%ld)",
rank, (long)core, (long)phys_socket, (long)socket);
return OPAL_ERROR;
return phys_core;
}
/* get the PHYSICAL processor id for the PHYSICAL socket/core */
if ( OPAL_SUCCESS != (rc = opal_paffinity_base_get_map_to_processor_id (phys_socket, phys_core, &phys_processor))) {
@ -224,9 +210,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i
if (logical_map) {
/* convert to physical core */
if (0 > (phys_core = opal_paffinity_base_get_physical_core_id(phys_socket, core))) {
opal_output(0, "Rank %ld: PAFFINITY cannot get physical core id for logical core %ld in physical socket %ld (%ld)",
rank, (long)core, (long)phys_socket, (long)socket);
return OPAL_ERROR;
return phys_core;
}
} else {
phys_core = core;
@ -252,9 +236,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i
lower_range = atoi(range[0]);
upper_range = atoi(range[1]);
if ( 0 > lower_range || num_cores < (upper_range - lower_range) || lower_range >= upper_range ) {
opal_output(0,"Rank %ld: PAFFINITY Error !!! Check your boundaries lower %d upper %d num_cores %d",
rank, lower_range, upper_range, num_cores);
return OPAL_ERROR;
return OPAL_ERR_SLOT_LIST_RANGE;
}
for (core=lower_range; core<=upper_range; core++) {
if (logical_map) {
@ -262,7 +244,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i
if (0 > (phys_core = opal_paffinity_base_get_physical_core_id(phys_socket, core))) {
opal_output(0, "Rank %ld: PAFFINITY cannot get physical core id for logical core %ld in physical socket %ld (%ld)",
rank, (long)core, (long)phys_socket, (long)socket);
return OPAL_ERROR;
return phys_core;
}
} else {
phys_core = core;
@ -310,9 +292,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i
if (logical_map) {
/* convert to physical core */
if (0 > (phys_core = opal_paffinity_base_get_physical_core_id(phys_socket, core))) {
opal_output(0, "Rank %ld: PAFFINITY cannot get physical core id for logical core %ld in physical socket %ld (%ld)",
rank, (long)core, (long)phys_socket, (long)socket);
return OPAL_ERROR;
return phys_core;
}
} else {
phys_core = core;
@ -338,17 +318,13 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i
lower_range = atoi(range[0]);
upper_range = atoi(range[1]);
if ( 0 > lower_range || num_cores < (upper_range - lower_range) || lower_range >= upper_range ) {
opal_output(0,"Rank %ld: PAFFINITY Error !!! Check your boundaries lower %d upper %d num_cores %d",
rank, lower_range, upper_range, num_cores);
return OPAL_ERROR;
return OPAL_ERR_SLOT_LIST_RANGE;
}
for (core=lower_range; core<=upper_range; core++) {
if (logical_map) {
/* convert to physical core */
if (0 > (phys_core = opal_paffinity_base_get_physical_core_id(phys_socket, core))) {
opal_output(0, "Rank %ld: PAFFINITY cannot get physical core id for logical core %ld in physical socket %ld (%ld)",
rank, (long)core, (long)phys_socket, (long)socket);
return OPAL_ERROR;
return phys_core;
}
} else {
phys_core = core;
@ -385,9 +361,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i
/* need to convert provided socket to a PHYSICAL socket id */
phys_socket = opal_paffinity_base_get_physical_socket_id(socket);
if (0 > phys_socket) {
opal_output(0, "Rank %ld: PAFFINITY cannot get physical socket id for logical socket %ld",
rank, (long)socket);
return OPAL_ERROR;
return phys_socket;
}
} else {
phys_socket = socket;
@ -395,8 +369,6 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i
/* get the LOGICAL core info for this socket */
if ( OPAL_SUCCESS != ( rc = opal_paffinity_base_get_core_info(phys_socket, &num_cores))) {
opal_output(0,"Rank %ld: PAFFINITY Error !!! Could not get core info for physical socket number %d (%d)",
rank, phys_socket, socket);
return rc;
}
@ -405,9 +377,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i
for (core = 0; core < num_cores; core++) {
/* convert to PHYSICAL core id */
if (0 > (phys_core = opal_paffinity_base_get_physical_core_id(phys_socket, core))) {
opal_output(0, "Rank %ld: PAFFINITY cannot get physical core id for logical core %ld in physical socket %ld (%ld)",
rank, (long)core, (long)phys_socket, (long)socket);
return OPAL_ERROR;
return phys_core;
}
/* get the PHYSICAL processor id for the PHYSICAL socket/core */
if ( OPAL_SUCCESS != (rc = opal_paffinity_base_get_map_to_processor_id (phys_socket, phys_core, &phys_processor))) {
@ -435,9 +405,7 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i
if (logical_map) {
/* convert to physical core */
if (0 > (phys_core = opal_paffinity_base_get_physical_core_id(phys_socket, core))) {
opal_output(0, "Rank %ld: PAFFINITY cannot get physical core id for logical core %ld in physical socket %ld (%ld)",
rank, (long)core, (long)phys_socket, (long)socket);
return OPAL_ERROR;
return phys_core;
}
} else {
phys_core = core;
@ -463,17 +431,13 @@ static int opal_paffinity_base_socket_core_to_cpu_set(char **socket_core_list, i
lower_range = atoi(range[0]);
upper_range = atoi(range[1]);
if ( 0 > lower_range || num_cores < (upper_range - lower_range) || lower_range >= upper_range ) {
opal_output(0,"Rank %ld: PAFFINITY Error !!! Check your boundaries lower %d upper %d num_cores %d",
rank, lower_range, upper_range, num_cores);
return OPAL_ERROR;
return OPAL_ERR_SLOT_LIST_RANGE;
}
for (core=lower_range; core<=upper_range; core++) {
if (logical_map) {
/* convert to physical core */
if (0 > (phys_core = opal_paffinity_base_get_physical_core_id(phys_socket, core))) {
opal_output(0, "Rank %ld: PAFFINITY cannot get physical core id for logical core %ld in physical socket %ld (%ld)",
rank, (long)core, (long)phys_socket, (long)socket);
return OPAL_ERROR;
return phys_core;
}
} else {
phys_core = core;

Просмотреть файл

@ -36,7 +36,7 @@ int opal_paffinity_base_set(opal_paffinity_base_cpu_set_t cpumask)
int rc;
if (!opal_paffinity_base_selected) {
return OPAL_ERR_NOT_FOUND;
return OPAL_ERR_MODULE_NOT_FOUND;
}
if (OPAL_SUCCESS == (rc = opal_paffinity_base_module->paff_module_set(cpumask))) {
opal_paffinity_base_bound = true;
@ -54,7 +54,7 @@ int opal_paffinity_base_get(opal_paffinity_base_cpu_set_t *cpumask)
}
if (!opal_paffinity_base_selected) {
return OPAL_ERR_NOT_FOUND;
return OPAL_ERR_MODULE_NOT_FOUND;
}
if(NULL == cpumask) {
return OPAL_ERR_BAD_PARAM;
@ -65,7 +65,7 @@ int opal_paffinity_base_get(opal_paffinity_base_cpu_set_t *cpumask)
/*
 * Forward socket, core and processor_id to the selected paffinity
 * module's paff_get_map_to_processor_id() entry point.
 *
 * Returns an error code immediately, without touching the module, when
 * opal_paffinity_base_selected is false; otherwise returns whatever the
 * module call returns.
 */
int opal_paffinity_base_get_map_to_processor_id(int socket, int core, int *processor_id)
{
if (!opal_paffinity_base_selected) {
/* NOTE(review): two consecutive returns appear here, so only the first
 * can execute as written. This looks like the pre- and post-change lines
 * of a diff shown together - confirm which error code is intended. */
return OPAL_ERR_NOT_FOUND;
return OPAL_ERR_MODULE_NOT_FOUND;
}
return opal_paffinity_base_module->paff_get_map_to_processor_id(socket, core, processor_id);
}
@ -73,7 +73,7 @@ int opal_paffinity_base_get_map_to_processor_id(int socket, int core, int *proce
/*
 * Forward processor_id, socket and core to the selected paffinity
 * module's paff_get_map_to_socket_core() entry point.
 *
 * Returns an error code immediately, without touching the module, when
 * opal_paffinity_base_selected is false; otherwise returns whatever the
 * module call returns.
 */
int opal_paffinity_base_get_map_to_socket_core(int processor_id, int *socket, int *core)
{
if (!opal_paffinity_base_selected) {
/* NOTE(review): two consecutive returns appear here, so only the first
 * can execute as written. This looks like the pre- and post-change lines
 * of a diff shown together - confirm which error code is intended. */
return OPAL_ERR_NOT_FOUND;
return OPAL_ERR_MODULE_NOT_FOUND;
}
return opal_paffinity_base_module->paff_get_map_to_socket_core(processor_id, socket, core);
}
@ -82,7 +82,7 @@ int opal_paffinity_base_get_map_to_socket_core(int processor_id, int *socket, in
/*
 * Forward num_processors to the selected paffinity module's
 * paff_get_processor_info() entry point.
 *
 * Returns an error code immediately, without touching the module, when
 * opal_paffinity_base_selected is false; otherwise returns whatever the
 * module call returns.
 */
int opal_paffinity_base_get_processor_info(int *num_processors)
{
if (!opal_paffinity_base_selected) {
/* NOTE(review): two consecutive returns appear here, so only the first
 * can execute as written. This looks like the pre- and post-change lines
 * of a diff shown together - confirm which error code is intended. */
return OPAL_ERR_NOT_FOUND;
return OPAL_ERR_MODULE_NOT_FOUND;
}
return opal_paffinity_base_module->paff_get_processor_info(num_processors);
}
@ -90,7 +90,7 @@ int opal_paffinity_base_get_processor_info(int *num_processors)
/*
 * Forward num_sockets to the selected paffinity module's
 * paff_get_socket_info() entry point.
 *
 * Returns an error code immediately, without touching the module, when
 * opal_paffinity_base_selected is false; otherwise returns whatever the
 * module call returns.
 */
int opal_paffinity_base_get_socket_info(int *num_sockets)
{
if (!opal_paffinity_base_selected) {
/* NOTE(review): two consecutive returns appear here, so only the first
 * can execute as written. This looks like the pre- and post-change lines
 * of a diff shown together - confirm which error code is intended. */
return OPAL_ERR_NOT_FOUND;
return OPAL_ERR_MODULE_NOT_FOUND;
}
return opal_paffinity_base_module->paff_get_socket_info(num_sockets);
}
@ -98,7 +98,7 @@ int opal_paffinity_base_get_socket_info(int *num_sockets)
/*
 * Forward socket and num_cores to the selected paffinity module's
 * paff_get_core_info() entry point.
 *
 * Returns an error code immediately, without touching the module, when
 * opal_paffinity_base_selected is false; otherwise returns whatever the
 * module call returns.
 */
int opal_paffinity_base_get_core_info(int socket, int *num_cores)
{
if (!opal_paffinity_base_selected) {
/* NOTE(review): two consecutive returns appear here, so only the first
 * can execute as written. This looks like the pre- and post-change lines
 * of a diff shown together - confirm which error code is intended. */
return OPAL_ERR_NOT_FOUND;
return OPAL_ERR_MODULE_NOT_FOUND;
}
return opal_paffinity_base_module->paff_get_core_info(socket, num_cores);
}
@ -106,7 +106,7 @@ int opal_paffinity_base_get_core_info(int socket, int *num_cores)
/*
 * Forward logical_processor_id to the selected paffinity module's
 * paff_get_physical_processor_id() entry point and return its result.
 *
 * When opal_paffinity_base_selected is false the module is not called and
 * an error code is returned in place of an id.
 */
int opal_paffinity_base_get_physical_processor_id(int logical_processor_id)
{
if (!opal_paffinity_base_selected) {
/* NOTE(review): two consecutive returns appear here, so only the first
 * can execute as written. This looks like the pre- and post-change lines
 * of a diff shown together - confirm which error code is intended. */
return OPAL_ERR_NOT_FOUND;
return OPAL_ERR_MODULE_NOT_FOUND;
}
return opal_paffinity_base_module->paff_get_physical_processor_id(logical_processor_id);
}
@ -114,7 +114,7 @@ int opal_paffinity_base_get_physical_processor_id(int logical_processor_id)
/*
 * Forward logical_socket_id to the selected paffinity module's
 * paff_get_physical_socket_id() entry point and return its result.
 *
 * When opal_paffinity_base_selected is false the module is not called and
 * an error code is returned in place of an id.
 */
int opal_paffinity_base_get_physical_socket_id(int logical_socket_id)
{
if (!opal_paffinity_base_selected) {
/* NOTE(review): two consecutive returns appear here, so only the first
 * can execute as written. This looks like the pre- and post-change lines
 * of a diff shown together - confirm which error code is intended. */
return OPAL_ERR_NOT_FOUND;
return OPAL_ERR_MODULE_NOT_FOUND;
}
return opal_paffinity_base_module->paff_get_physical_socket_id(logical_socket_id);
}
@ -122,7 +122,7 @@ int opal_paffinity_base_get_physical_socket_id(int logical_socket_id)
/*
 * Forward physical_socket_id and logical_core_id to the selected
 * paffinity module's paff_get_physical_core_id() entry point and return
 * its result.
 *
 * When opal_paffinity_base_selected is false the module is not called and
 * an error code is returned in place of an id.
 */
int opal_paffinity_base_get_physical_core_id(int physical_socket_id, int logical_core_id)
{
if (!opal_paffinity_base_selected) {
/* NOTE(review): two consecutive returns appear here, so only the first
 * can execute as written. This looks like the pre- and post-change lines
 * of a diff shown together - confirm which error code is intended. */
return OPAL_ERR_NOT_FOUND;
return OPAL_ERR_MODULE_NOT_FOUND;
}
return opal_paffinity_base_module->paff_get_physical_core_id(physical_socket_id, logical_core_id);
}

Просмотреть файл

@ -168,11 +168,39 @@ opal_err2str(int errnum)
case OPAL_ERR_DATA_OVERWRITE_ATTEMPT:
retval = "Attempt to overwrite a data value";
break;
case OPAL_ERR_MODULE_NOT_FOUND:
retval = "Framework requires at least one active module, but none found";
break;
case OPAL_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
retval = "OS topology does not support slot_list process affinity";
break;
case OPAL_ERR_TOPO_SOCKET_NOT_SUPPORTED:
retval = "Could not obtain socket topology information";
break;
case OPAL_ERR_TOPO_CORE_NOT_SUPPORTED:
retval = "Could not obtain core topology information";
break;
case OPAL_ERR_NOT_ENOUGH_SOCKETS:
retval = "Not enough sockets to meet request";
break;
case OPAL_ERR_NOT_ENOUGH_CORES:
retval = "Not enough cores to meet request";
break;
case OPAL_ERR_INVALID_PHYS_CPU:
retval = "Invalid physical cpu number returned";
break;
case OPAL_ERR_MULTIPLE_AFFINITIES:
retval = "Multiple methods for assigning process affinity were specified";
break;
case OPAL_ERR_SLOT_LIST_RANGE:
retval = "Provided slot_list range is invalid";
break;
default:
retval = NULL;
}
}
return retval;
return retval;
}

Просмотреть файл

@ -31,40 +31,50 @@ enum {
/* Error codes inherited from OPAL. Still enum values so that we
get the nice debugger help. */
ORTE_SUCCESS = OPAL_SUCCESS,
ORTE_SUCCESS = OPAL_SUCCESS,
ORTE_ERROR = OPAL_ERROR,
ORTE_ERR_OUT_OF_RESOURCE = OPAL_ERR_OUT_OF_RESOURCE,
ORTE_ERR_TEMP_OUT_OF_RESOURCE = OPAL_ERR_TEMP_OUT_OF_RESOURCE,
ORTE_ERR_RESOURCE_BUSY = OPAL_ERR_RESOURCE_BUSY,
ORTE_ERR_BAD_PARAM = OPAL_ERR_BAD_PARAM,
ORTE_ERR_FATAL = OPAL_ERR_FATAL,
ORTE_ERR_NOT_IMPLEMENTED = OPAL_ERR_NOT_IMPLEMENTED,
ORTE_ERR_NOT_SUPPORTED = OPAL_ERR_NOT_SUPPORTED,
ORTE_ERR_INTERUPTED = OPAL_ERR_INTERUPTED,
ORTE_ERR_WOULD_BLOCK = OPAL_ERR_WOULD_BLOCK,
ORTE_ERR_IN_ERRNO = OPAL_ERR_IN_ERRNO,
ORTE_ERR_UNREACH = OPAL_ERR_UNREACH,
ORTE_ERR_NOT_FOUND = OPAL_ERR_NOT_FOUND,
ORTE_EXISTS = OPAL_EXISTS,
ORTE_ERR_TIMEOUT = OPAL_ERR_TIMEOUT,
ORTE_ERR_NOT_AVAILABLE = OPAL_ERR_NOT_AVAILABLE,
ORTE_ERR_PERM = OPAL_ERR_PERM,
ORTE_ERR_VALUE_OUT_OF_BOUNDS = OPAL_ERR_VALUE_OUT_OF_BOUNDS,
ORTE_ERR_FILE_READ_FAILURE = OPAL_ERR_FILE_READ_FAILURE,
ORTE_ERR_FILE_WRITE_FAILURE = OPAL_ERR_FILE_WRITE_FAILURE,
ORTE_ERR_FILE_OPEN_FAILURE = OPAL_ERR_FILE_OPEN_FAILURE,
ORTE_ERR_PACK_MISMATCH = OPAL_ERR_PACK_MISMATCH,
ORTE_ERR_PACK_FAILURE = OPAL_ERR_PACK_FAILURE,
ORTE_ERR_UNPACK_FAILURE = OPAL_ERR_UNPACK_FAILURE,
ORTE_ERR_UNPACK_INADEQUATE_SPACE = OPAL_ERR_UNPACK_INADEQUATE_SPACE,
ORTE_ERROR = OPAL_ERROR,
ORTE_ERR_OUT_OF_RESOURCE = OPAL_ERR_OUT_OF_RESOURCE,
ORTE_ERR_TEMP_OUT_OF_RESOURCE = OPAL_ERR_TEMP_OUT_OF_RESOURCE,
ORTE_ERR_RESOURCE_BUSY = OPAL_ERR_RESOURCE_BUSY,
ORTE_ERR_BAD_PARAM = OPAL_ERR_BAD_PARAM,
ORTE_ERR_FATAL = OPAL_ERR_FATAL,
ORTE_ERR_NOT_IMPLEMENTED = OPAL_ERR_NOT_IMPLEMENTED,
ORTE_ERR_NOT_SUPPORTED = OPAL_ERR_NOT_SUPPORTED,
ORTE_ERR_INTERUPTED = OPAL_ERR_INTERUPTED,
ORTE_ERR_WOULD_BLOCK = OPAL_ERR_WOULD_BLOCK,
ORTE_ERR_IN_ERRNO = OPAL_ERR_IN_ERRNO,
ORTE_ERR_UNREACH = OPAL_ERR_UNREACH,
ORTE_ERR_NOT_FOUND = OPAL_ERR_NOT_FOUND,
ORTE_EXISTS = OPAL_EXISTS,
ORTE_ERR_TIMEOUT = OPAL_ERR_TIMEOUT,
ORTE_ERR_NOT_AVAILABLE = OPAL_ERR_NOT_AVAILABLE,
ORTE_ERR_PERM = OPAL_ERR_PERM,
ORTE_ERR_VALUE_OUT_OF_BOUNDS = OPAL_ERR_VALUE_OUT_OF_BOUNDS,
ORTE_ERR_FILE_READ_FAILURE = OPAL_ERR_FILE_READ_FAILURE,
ORTE_ERR_FILE_WRITE_FAILURE = OPAL_ERR_FILE_WRITE_FAILURE,
ORTE_ERR_FILE_OPEN_FAILURE = OPAL_ERR_FILE_OPEN_FAILURE,
ORTE_ERR_PACK_MISMATCH = OPAL_ERR_PACK_MISMATCH,
ORTE_ERR_PACK_FAILURE = OPAL_ERR_PACK_FAILURE,
ORTE_ERR_UNPACK_FAILURE = OPAL_ERR_UNPACK_FAILURE,
ORTE_ERR_UNPACK_INADEQUATE_SPACE = OPAL_ERR_UNPACK_INADEQUATE_SPACE,
ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER = OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER,
ORTE_ERR_TYPE_MISMATCH = OPAL_ERR_TYPE_MISMATCH,
ORTE_ERR_OPERATION_UNSUPPORTED = OPAL_ERR_OPERATION_UNSUPPORTED,
ORTE_ERR_UNKNOWN_DATA_TYPE = OPAL_ERR_UNKNOWN_DATA_TYPE,
ORTE_ERR_BUFFER = OPAL_ERR_BUFFER,
ORTE_ERR_DATA_TYPE_REDEF = OPAL_ERR_DATA_TYPE_REDEF,
ORTE_ERR_DATA_OVERWRITE_ATTEMPT = OPAL_ERR_DATA_OVERWRITE_ATTEMPT,
ORTE_ERR_TYPE_MISMATCH = OPAL_ERR_TYPE_MISMATCH,
ORTE_ERR_OPERATION_UNSUPPORTED = OPAL_ERR_OPERATION_UNSUPPORTED,
ORTE_ERR_UNKNOWN_DATA_TYPE = OPAL_ERR_UNKNOWN_DATA_TYPE,
ORTE_ERR_BUFFER = OPAL_ERR_BUFFER,
ORTE_ERR_DATA_TYPE_REDEF = OPAL_ERR_DATA_TYPE_REDEF,
ORTE_ERR_DATA_OVERWRITE_ATTEMPT = OPAL_ERR_DATA_OVERWRITE_ATTEMPT,
ORTE_ERR_MODULE_NOT_FOUND = OPAL_ERR_MODULE_NOT_FOUND,
ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED = OPAL_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED,
ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED = OPAL_ERR_TOPO_SOCKET_NOT_SUPPORTED,
ORTE_ERR_TOPO_CORE_NOT_SUPPORTED = OPAL_ERR_TOPO_CORE_NOT_SUPPORTED,
ORTE_ERR_NOT_ENOUGH_SOCKETS = OPAL_ERR_NOT_ENOUGH_SOCKETS,
ORTE_ERR_NOT_ENOUGH_CORES = OPAL_ERR_NOT_ENOUGH_CORES,
ORTE_ERR_INVALID_PHYS_CPU = OPAL_ERR_INVALID_PHYS_CPU,
ORTE_ERR_MULTIPLE_AFFINITIES = OPAL_ERR_MULTIPLE_AFFINITIES,
ORTE_ERR_SLOT_LIST_RANGE = OPAL_ERR_SLOT_LIST_RANGE,
/* error codes specific to ORTE - don't forget to update
orte/util/error_strings.c when adding new error codes!!
Otherwise, the error reporting system will potentially crash,
@ -101,8 +111,9 @@ enum {
ORTE_ERR_SYS_LIMITS_SOCKETS = (ORTE_ERR_BASE - 29),
ORTE_ERR_SOCKET_NOT_AVAILABLE = (ORTE_ERR_BASE - 30),
ORTE_ERR_SYSTEM_WILL_BOOTSTRAP = (ORTE_ERR_BASE - 31),
ORTE_ERR_MODULE_NOT_FOUND = (ORTE_ERR_BASE - 32),
ORTE_ERR_RELOCATE_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 33)
ORTE_ERR_RELOCATE_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 32),
ORTE_ERR_INVALID_NODE_RANK = (ORTE_ERR_BASE - 33),
ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 34)
};
#define ORTE_ERR_MAX (ORTE_ERR_BASE - 100)

Просмотреть файл

@ -46,16 +46,6 @@ This could mean that your PATH or executable name is wrong, or that you do not
have the necessary permissions. Please ensure that the executable is able to be
found and executed.
#
[nodeid-out-of-range]
The id of a node is out of the allowed range.
Value given: %ld
Max value allowed: %ld
This may be resolved by increasing the number of available node id's by
re-configuring Open MPI with the --enable-jumbo-clusters option, and then
re-running the application
#
[odls-default:multiple-paffinity-schemes]
Multiple processor affinity schemes were specified (can only specify one):
@ -130,3 +120,27 @@ binding action:
Application name: %s
Please revise the request and try again.
#
[odls-default:paffinity-missing-module]
A request to bind processes was made, but no paffinity module
was found:
Local host: %s
This is potentially a configuration problem. You can rerun your job without
requesting binding, or check the configuration.
#
[odls-default:invalid-slot-list-range]
A slot list was provided that exceeds the boundaries on available
resources:
Local host: %s
Slot list: %s
Please check your boundaries and try again.
#
[odls-default:affinity-not-supported]
A request was made to bind processes, but process affinity is
not supported on this node:
Local host: %s

Просмотреть файл

@ -294,7 +294,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
child->pid = pid;
}
if(pid < 0) {
if (pid < 0) {
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
if (NULL != child) {
child->state = ORTE_PROC_STATE_FAILED_TO_START;
@ -344,10 +344,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
child->slot_list, ORTE_NAME_PRINT(child->name)));
if (opal_paffinity_alone) {
/* It's an error if multiple paffinity schemes were specified */
orte_show_help("help-odls-default.txt",
"odls-default:multiple-paffinity-schemes", true, child->slot_list);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_MULTIPLE_AFFINITIES);
}
if (orte_report_bindings) {
opal_output(0, "%s odls:default:fork binding child %s to slot_list %s",
@ -355,17 +352,6 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
ORTE_NAME_PRINT(child->name), child->slot_list);
}
if (ORTE_SUCCESS != (rc = opal_paffinity_base_slot_list_set((long)child->name->vpid, child->slot_list, &mask))) {
if (ORTE_ERR_NOT_SUPPORTED == rc) {
/* OS doesn't support providing topology information */
orte_show_help("help-odls-default.txt",
"odls-default:topo-not-supported",
true, orte_process_info.nodename, "rankfile containing a slot_list of ",
child->slot_list, context->app);
ORTE_ODLS_ERROR_OUT(rc);
}
orte_show_help("help-odls-default.txt",
"odls-default:slot-list-failed", true, child->slot_list, ORTE_ERROR_NAME(rc));
ORTE_ODLS_ERROR_OUT(rc);
}
/* if we didn't wind up bound, then generate a warning unless suppressed */
@ -381,15 +367,11 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
(int)jobdat->cpus_per_rank, (int)jobdat->stride));
/* get the node rank */
if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(child->name))) {
orte_show_help("help-odls-default.txt",
"odls-default:invalid-node-rank", true);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_INVALID_NODE_RANK);
}
/* get the local rank */
if (ORTE_LOCAL_RANK_INVALID == (lrank = orte_ess.get_local_rank(child->name))) {
orte_show_help("help-odls-default.txt",
"odls-default:invalid-local-rank", true);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_INVALID_LOCAL_RANK);
}
/* init the mask */
OPAL_PAFFINITY_CPU_ZERO(mask);
@ -417,22 +399,14 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
/* if we don't have enough sockets, that is an error */
if (n < logical_skt) {
ORTE_ODLS_IF_BIND_NOT_REQD(5);
orte_show_help("help-odls-default.txt",
"odls-default:not-enough-resources", true,
"sockets", orte_process_info.nodename,
"bind-to-core", context->app);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_NOT_ENOUGH_SOCKETS);
}
} else {
target_socket = opal_paffinity_base_get_physical_socket_id(logical_skt);
if (ORTE_ERR_NOT_SUPPORTED == target_socket) {
if (target_socket < 0) {
/* OS doesn't support providing topology information */
ORTE_ODLS_IF_BIND_NOT_REQD(5);
orte_show_help("help-odls-default.txt",
"odls-default:topo-not-supported",
true, orte_process_info.nodename, "bind-to-core", "",
context->app);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(target_socket);
}
}
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
@ -453,14 +427,10 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
* from when we initialized
*/
target_socket = opal_paffinity_base_get_physical_socket_id(lrank % orte_odls_globals.num_sockets);
if (ORTE_ERR_NOT_SUPPORTED == target_socket) {
if (target_socket < 0) {
/* OS does not support providing topology information */
ORTE_ODLS_IF_BIND_NOT_REQD(5);
orte_show_help("help-odls-default.txt",
"odls-default:topo-not-supported",
true, orte_process_info.nodename, "bind-to-core", "",
context->app);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(target_socket);
}
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"bysocket lrank %d numsocks %d logical socket %d target socket %d", (int)lrank,
@ -477,18 +447,12 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
phys_core = opal_paffinity_base_get_physical_core_id(target_socket, logical_cpu);
if (0 > phys_core) {
ORTE_ODLS_IF_BIND_NOT_REQD(5);
orte_show_help("help-odls-default.txt",
"odls-default:invalid-phys-cpu", true);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(phys_core);
}
/* map this to a physical cpu on this node */
if (ORTE_SUCCESS != opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, &phys_cpu)) {
if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, &phys_cpu))) {
ORTE_ODLS_IF_BIND_NOT_REQD(5);
orte_show_help("help-odls-default.txt",
"odls-default:not-enough-resources", true,
"processors", orte_process_info.nodename,
"bind-to-core", context->app);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(rc);
}
/* are we bound? */
if (orte_odls_globals.bound) {
@ -532,11 +496,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
/* if we don't have enough processors, that is an error */
if (ncpu <= logical_cpu) {
ORTE_ODLS_IF_BIND_NOT_REQD(5);
orte_show_help("help-odls-default.txt",
"odls-default:not-enough-resources", true,
"processors", orte_process_info.nodename,
"bind-to-core", context->app);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_NOT_ENOUGH_CORES);
}
} else {
/* if we are not bound, then all processors are available
@ -544,19 +504,9 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
* physical cpu
*/
phys_cpu = opal_paffinity_base_get_physical_processor_id(logical_cpu);
if (OPAL_ERROR == phys_cpu){
/* No processor to bind to so error out */
if (0 > phys_cpu) {
ORTE_ODLS_IF_BIND_NOT_REQD(5);
orte_show_help("help-odls-default.txt",
"odls-default:not-enough-resources", true,
"processors", orte_process_info.nodename,
"bind-to-core", context->app);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
} else if (0 > phys_cpu) {
ORTE_ODLS_IF_BIND_NOT_REQD(5);
orte_show_help("help-odls-default.txt",
"odls-default:invalid-phys-cpu", true);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(phys_cpu);
}
}
OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
@ -573,8 +523,6 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
}
if (ORTE_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
ORTE_ODLS_IF_BIND_NOT_REQD(5);
orte_show_help("help-odls-default.txt",
"odls-default:failed-set-paff", true);
ORTE_ODLS_ERROR_OUT(rc);
}
paffinity_enabled = true;
@ -590,9 +538,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
* the provided mapping policy
*/
if (ORTE_LOCAL_RANK_INVALID == (lrank = orte_ess.get_local_rank(child->name))) {
orte_show_help("help-odls-default.txt",
"odls-default:invalid-local-rank", true);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_INVALID_LOCAL_RANK);
}
if (ORTE_MAPPING_NPERXXX & jobdat->policy) {
/* we need to balance the children from this job across the available sockets */
@ -618,22 +564,14 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
/* if we don't have enough sockets, that is an error */
if (n < logical_skt) {
ORTE_ODLS_IF_BIND_NOT_REQD(6);
orte_show_help("help-odls-default.txt",
"odls-default:not-enough-resources", true,
"sockets", orte_process_info.nodename,
"bind-to-socket", context->app);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_NOT_ENOUGH_SOCKETS);
}
} else {
target_socket = opal_paffinity_base_get_physical_socket_id(logical_skt);
if (ORTE_ERR_NOT_SUPPORTED == target_socket) {
if (target_socket < 0) {
/* OS doesn't support providing topology information */
ORTE_ODLS_IF_BIND_NOT_REQD(6);
orte_show_help("help-odls-default.txt",
"odls-default:topo-not-supported",
true, orte_process_info.nodename, "bind-to-socket", "",
context->app);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(target_socket);
}
}
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
@ -650,14 +588,10 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
* from when we initialized
*/
target_socket = opal_paffinity_base_get_physical_socket_id(lrank % orte_odls_globals.num_sockets);
if (ORTE_ERR_NOT_SUPPORTED == target_socket) {
if (target_socket < 0) {
/* OS does not support providing topology information */
ORTE_ODLS_IF_BIND_NOT_REQD(6);
orte_show_help("help-odls-default.txt",
"odls-default:topo-not-supported",
true, orte_process_info.nodename, "bind-to-socket", "",
context->app);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(target_socket);
}
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
"bysocket lrank %d numsocks %d logical socket %d target socket %d", (int)lrank,
@ -687,36 +621,24 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
/* if we don't have enough processors, that is an error */
if (ncpu < logical_cpu) {
ORTE_ODLS_IF_BIND_NOT_REQD(6);
orte_show_help("help-odls-default.txt",
"odls-default:not-enough-resources", true,
"processors", orte_process_info.nodename,
"bind-to-socket", context->app);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_NOT_ENOUGH_SOCKETS);
}
/* get the physical socket of that cpu */
if (ORTE_SUCCESS != opal_paffinity_base_get_map_to_socket_core(phys_cpu, &target_socket, &phys_core)) {
if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_map_to_socket_core(phys_cpu, &target_socket, &phys_core))) {
if (ORTE_BINDING_NOT_REQUIRED(jobdat->policy)) {
goto LAUNCH_PROCS;
}
orte_show_help("help-odls-default.txt",
"odls-default:topo-not-supported",
true, orte_process_info.nodename, "bind-to-socket", "",
context->app);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(rc);
}
} else {
/* if we are not bound, then just use all sockets */
if (1 == orte_odls_globals.num_sockets) {
/* if we only have one socket, then just put it there */
target_socket = opal_paffinity_base_get_physical_socket_id(0);
if (ORTE_ERR_NOT_SUPPORTED == target_socket) {
if (target_socket < 0) {
/* OS doesn't support providing topology information */
ORTE_ODLS_IF_BIND_NOT_REQD(6);
orte_show_help("help-odls-default.txt",
"odls-default:topo-not-supported",
true, orte_process_info.nodename, "bind-to-socket", "",
context->app);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(target_socket);
}
} else {
/* compute the logical socket, compensating for the number of cpus_per_rank */
@ -725,14 +647,10 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
logical_skt = logical_skt % orte_odls_globals.num_sockets;
/* now get the target physical socket */
target_socket = opal_paffinity_base_get_physical_socket_id(logical_skt);
if (ORTE_ERR_NOT_SUPPORTED == target_socket) {
if (target_socket < 0) {
/* OS doesn't support providing topology information */
ORTE_ODLS_IF_BIND_NOT_REQD(6);
orte_show_help("help-odls-default.txt",
"odls-default:topo-not-supported",
true, orte_process_info.nodename, "bind-to-socket", "",
context->app);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(target_socket);
}
}
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
@ -745,18 +663,14 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
for (n=0; n < orte_default_num_cores_per_socket; n++) {
/* get the physical core within this target socket */
phys_core = opal_paffinity_base_get_physical_core_id(target_socket, n);
if (0 > phys_core) {
if (phys_core < 0) {
ORTE_ODLS_IF_BIND_NOT_REQD(6);
orte_show_help("help-odls-default.txt",
"odls-default:invalid-phys-cpu", true);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(phys_core);
}
/* map this to a physical cpu on this node */
if (ORTE_SUCCESS != opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, &phys_cpu)) {
if (ORTE_SUCCESS != (rc = opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, &phys_cpu))) {
ORTE_ODLS_IF_BIND_NOT_REQD(6);
orte_show_help("help-odls-default.txt",
"odls-default:invalid-phys-cpu", true);
ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL);
ORTE_ODLS_ERROR_OUT(rc);
}
/* are we bound? */
if (orte_odls_globals.bound) {
@ -783,8 +697,6 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
}
if (ORTE_SUCCESS != (rc = opal_paffinity_base_set(mask))) {
ORTE_ODLS_IF_BIND_NOT_REQD(6);
orte_show_help("help-odls-default.txt",
"odls-default:failed-set-paff", true);
ORTE_ODLS_ERROR_OUT(rc);
}
paffinity_enabled = true;
@ -932,12 +844,64 @@ LAUNCH_PROCS:
know about the failure. The actual exit status of child proc
cannot be found here - all we can do is report the ORTE error
code that was reported back to us. The calling func needs to report the
failure to launch this process through the SMR or else
failure to launch this process through the errmgr or else
everyone else will hang.
*/
if (NULL != child) {
    /* Mark the child as never having started and record the error code
     * that was reported back to us, so the caller can pass it along.
     */
    child->state = ORTE_PROC_STATE_FAILED_TO_START;
    child->exit_code = i;

    /* Translate the reported error code into a help message. Codes that
     * are not listed below produce no message here.
     */
    if (ORTE_ERR_MULTIPLE_AFFINITIES == i) {
        /* It's an error if multiple paffinity schemes were specified */
        orte_show_help("help-odls-default.txt",
                       "odls-default:multiple-paffinity-schemes", true, child->slot_list);
    } else if (ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED == i) {
        /* OS doesn't support providing topology information */
        orte_show_help("help-odls-default.txt",
                       "odls-default:topo-not-supported",
                       true, orte_process_info.nodename, "rankfile containing a slot_list of ",
                       child->slot_list, context->app);
    } else if (ORTE_ERR_INVALID_NODE_RANK == i) {
        orte_show_help("help-odls-default.txt",
                       "odls-default:invalid-node-rank", true);
    } else if (ORTE_ERR_INVALID_LOCAL_RANK == i) {
        orte_show_help("help-odls-default.txt",
                       "odls-default:invalid-local-rank", true);
    } else if (ORTE_ERR_NOT_ENOUGH_CORES == i) {
        /* The missing resource for bind-to-core is processors, not
         * sockets; "sockets" belongs to the bind-to-socket case below.
         */
        orte_show_help("help-odls-default.txt",
                       "odls-default:not-enough-resources", true,
                       "processors", orte_process_info.nodename,
                       "bind-to-core", context->app);
    } else if (ORTE_ERR_TOPO_CORE_NOT_SUPPORTED == i) {
        orte_show_help("help-odls-default.txt",
                       "odls-default:topo-not-supported",
                       true, orte_process_info.nodename, "bind-to-core", "",
                       context->app);
    } else if (ORTE_ERR_INVALID_PHYS_CPU == i) {
        orte_show_help("help-odls-default.txt",
                       "odls-default:invalid-phys-cpu", true);
    } else if (ORTE_ERR_NOT_ENOUGH_SOCKETS == i) {
        orte_show_help("help-odls-default.txt",
                       "odls-default:not-enough-resources", true,
                       "sockets", orte_process_info.nodename,
                       "bind-to-socket", context->app);
    } else if (ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED == i) {
        orte_show_help("help-odls-default.txt",
                       "odls-default:topo-not-supported",
                       true, orte_process_info.nodename, "bind-to-socket", "",
                       context->app);
    } else if (ORTE_ERR_MODULE_NOT_FOUND == i) {
        orte_show_help("help-odls-default.txt",
                       "odls-default:paffinity-missing-module",
                       true, orte_process_info.nodename);
    } else if (ORTE_ERR_SLOT_LIST_RANGE == i) {
        orte_show_help("help-odls-default.txt",
                       "odls-default:invalid-slot-list-range",
                       true, orte_process_info.nodename, child->slot_list);
    } else if (ORTE_ERR_NOT_SUPPORTED == i) {
        orte_show_help("help-odls-default.txt",
                       "odls-default:affinity-not-supported",
                       true, orte_process_info.nodename);
    }
}
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,

Просмотреть файл

@ -126,9 +126,6 @@ const char *orte_err2str(int errnum)
case ORTE_ERR_SYSTEM_WILL_BOOTSTRAP:
retval = "System will determine resources during bootstrap of daemons";
break;
case ORTE_ERR_MODULE_NOT_FOUND:
retval = "Framework requires at least one active module, but none found";
break;
case ORTE_ERR_RELOCATE_LIMIT_EXCEEDED:
retval = "Limit on number of process relocations was exceeded";
break;