Detect homogeneous/heterogeneous (homo/hetero) scenarios in the nidmap, and set up the basic grpcomm module to take the appropriate actions.
NOT for inclusion in v1.3. This commit was SVN r18786.
Этот коммит содержится в:
родитель
160ba5fe11
Коммит
6f85e34d66
@ -481,12 +481,51 @@ static int modex(opal_list_t *procs)
|
||||
* don't need to exchange arch's as they are all identical
|
||||
*/
|
||||
if (OMPI_ENABLE_HETEROGENEOUS_SUPPORT) {
|
||||
/* Case 1: If different apps in this job were built differently - e.g., some
|
||||
* are built 32-bit while others are built 64-bit - then we need to modex
|
||||
* regardless of any other consideration. The user is reqd to tell us via a
|
||||
* cmd line option if this situation exists, which will result in an mca param
|
||||
* being set for us, so all we need to do is check for the global boolean
|
||||
* that corresponds to that param
|
||||
*/
|
||||
if (orte_hetero_apps) {
|
||||
modex_reqd = true;
|
||||
}
|
||||
/* Case 2: the nodes are homo and our arch matches the one seen by my daemon. In
|
||||
* this case, we are actually operating homogeneous even though hetero
|
||||
* is supported, so no modex info is required
|
||||
*/
|
||||
if (orte_homogeneous_nodes &&
|
||||
orte_process_info.arch == orte_ess.proc_get_arch(ORTE_PROC_MY_DAEMON)) {
|
||||
modex_reqd = false;
|
||||
}
|
||||
/* Case 2: the nodes are hetero, but the orted and app binaries were built
|
||||
* the same - i.e., either they are both 32-bit, or they are both 64-bit, but
|
||||
* no mixing of the two. In this case, we include the info in the modex
|
||||
*/
|
||||
else if (!orte_homogeneous_nodes) {
|
||||
modex_reqd = true;
|
||||
}
|
||||
/* Case 3: the nodes are homo, but the orted and app binaries were built
|
||||
* differently - i.e., one is built 32-bit, and the other is built 64-bit.
|
||||
* There are two sub-cases here, so we consider them separately
|
||||
*
|
||||
* Case 3(a): all apps were built the same. In this case, we can just
|
||||
* use our own arch and do not need to modex. Since by default we fill-in
|
||||
* the local nidmap with our own arch, we don't need to do anything here
|
||||
*/
|
||||
else if (orte_homogeneous_nodes && orte_hetero_apps) {
|
||||
modex_reqd = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (modex_reqd) {
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.arch, 1, OPAL_UINT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* pack the entries we have received */
|
||||
if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&buf, &modex_reqd))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -543,6 +582,10 @@ static int modex(opal_list_t *procs)
|
||||
}
|
||||
|
||||
if (OMPI_ENABLE_HETEROGENEOUS_SUPPORT) {
|
||||
/* are the nodes hetero? */
|
||||
if (orte_homogeneous_nodes) {
|
||||
goto unpack_entries;
|
||||
}
|
||||
/* unpack its architecture */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &arch, &cnt, OPAL_UINT32))) {
|
||||
@ -556,6 +599,7 @@ static int modex(opal_list_t *procs)
|
||||
}
|
||||
}
|
||||
|
||||
unpack_entries:
|
||||
/* update the modex database */
|
||||
if (ORTE_SUCCESS != (rc = orte_grpcomm_base_update_modex_entries(&proc_name, &rbuf))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
|
@ -48,7 +48,6 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
|
||||
opal_buffer_t buf;
|
||||
int step;
|
||||
int32_t *arch;
|
||||
bool homo;
|
||||
|
||||
/* setup a buffer for tmp use */
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
@ -230,14 +229,14 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
|
||||
|
||||
if (OMPI_ENABLE_HETEROGENEOUS_SUPPORT) {
|
||||
/* check to see if all reported archs are the same */
|
||||
homo = true;
|
||||
orte_homogeneous_nodes = true;
|
||||
for (i=1; i < num_nodes; i++) {
|
||||
if (nodes[i]->arch != nodes[0]->arch) {
|
||||
homo = false;
|
||||
orte_homogeneous_nodes = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (homo) {
|
||||
if (orte_homogeneous_nodes) {
|
||||
/* if everything is homo, just set that
|
||||
* flag - no need to send everything
|
||||
*/
|
||||
@ -455,8 +454,13 @@ vpids:
|
||||
*/
|
||||
n=1;
|
||||
opal_dss.unpack(&buf, &num_digs, &n, OPAL_UINT8);
|
||||
if (0 != num_digs) {
|
||||
/* hetero situation - get the archs */
|
||||
if (0 == num_digs) {
|
||||
/* homo situation */
|
||||
orte_homogeneous_nodes = true;
|
||||
} else {
|
||||
/* hetero situation */
|
||||
orte_homogeneous_nodes = false;
|
||||
/* get the archs */
|
||||
arch = (int32_t*)malloc(num_nodes * 4);
|
||||
/* unpack the values */
|
||||
n=num_nodes;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user