1
1

Continue development of regular expression support by implementing it for slurm launches. This works for both the initial launch (cmd line and non-cmd line) and comm_spawn launches.

Additional work is required to fully enable static port support when using the cmd line regular expression launch system.

This commit was SVN r21502.
Этот коммит содержится в:
Ralph Castain 2009-06-23 20:25:38 +00:00
родитель bca8015b94
Коммит 0ba845fed2
17 изменённых файлов: 786 добавлений и 387 удалений

Просмотреть файл

@ -51,6 +51,7 @@
#include "orte/util/session_dir.h" #include "orte/util/session_dir.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/util/nidmap.h" #include "orte/util/nidmap.h"
#include "orte/util/regex.h"
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
#include "orte/mca/notifier/base/base.h" #include "orte/mca/notifier/base/base.h"
@ -184,6 +185,16 @@ int orte_ess_base_orted_setup(char **hosts)
error = "orte_util_nidmap_init"; error = "orte_util_nidmap_init";
goto error; goto error;
} }
if (NULL != orted_launch_cmd) {
/* the launch cmd was given via regexp on the cmd line - parse
* it to get the contact info
*/
if (ORTE_SUCCESS != (ret = orte_regex_decode_maps(orted_launch_cmd, NULL))) {
ORTE_ERROR_LOG(ret);
error = "orte_regex_decode_maps";
goto error;
}
} else {
if (ORTE_SUCCESS != (ret = orte_util_setup_local_nidmap_entries())) { if (ORTE_SUCCESS != (ret = orte_util_setup_local_nidmap_entries())) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
error = "orte_util_nidmap_init"; error = "orte_util_nidmap_init";
@ -197,6 +208,7 @@ int orte_ess_base_orted_setup(char **hosts)
error = "construct daemon map from static ports"; error = "construct daemon map from static ports";
goto error; goto error;
} }
}
/* be sure to update the routing tree so the initial "phone home" /* be sure to update the routing tree so the initial "phone home"
* to mpirun goes through the tree! * to mpirun goes through the tree!
*/ */

Просмотреть файл

@ -60,6 +60,7 @@
#include "orte/util/session_dir.h" #include "orte/util/session_dir.h"
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
#include "orte/util/nidmap.h" #include "orte/util/nidmap.h"
#include "orte/util/regex.h"
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
@ -103,21 +104,19 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
/* get a pointer to the job map */ /* get a pointer to the job map */
map = jdata->map; map = jdata->map;
/* construct a nodemap */ /* are we passing a regexp? */
if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(&bo))) { if (orte_use_regexp && jdata->num_apps < 2 && NULL == orte_debugger_daemon) {
ORTE_ERROR_LOG(rc); char *regexp;
return rc; flag = 1;
} opal_dss.pack(data, &flag, 1, OPAL_INT8);
regexp = orte_regex_encode_maps(jdata);
/* store it */ opal_dss.pack(data, &regexp, 1, OPAL_STRING);
boptr = &bo; free(regexp);
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &boptr, 1, OPAL_BYTE_OBJECT))) { /* if we are not using static ports, then we need to add the daemon wireup info */
ORTE_ERROR_LOG(rc); if (!orte_static_ports) {
return rc; /* pack a flag indicating that wiring info is provided */
} flag = 1;
/* release the data since it has now been copied into our buffer */ opal_dss.pack(data, &flag, 1, OPAL_INT8);
free(bo.bytes);
/* get wireup info for daemons per the selected routing module */ /* get wireup info for daemons per the selected routing module */
wireup = OBJ_NEW(opal_buffer_t); wireup = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (rc = orte_routed.get_wireup_info(wireup))) { if (ORTE_SUCCESS != (rc = orte_routed.get_wireup_info(wireup))) {
@ -154,6 +153,97 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
} }
} }
OBJ_RELEASE(wireup); OBJ_RELEASE(wireup);
} else {
/* pack a flag indicating no wireup info is provided */
flag = 0;
opal_dss.pack(data, &flag, 1, OPAL_INT8);
}
/* insert an "add-procs" command here so we can cleanly process it on the
* other end
*/
command = ORTE_DAEMON_ADD_LOCAL_PROCS;
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* since we will have processed this to update daemons, flag that we don't
* have the regexp again
*/
flag = 2;
opal_dss.pack(data, &flag, 1, OPAL_INT8);
/* pack the jobid so it can be extracted later */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &job, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* all done */
return ORTE_SUCCESS;
}
/* if we are not passing a regexp, then pass the nodemap */
flag = 0;
opal_dss.pack(data, &flag, 1, OPAL_INT8);
/* construct a nodemap */
if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(&bo))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* store it */
boptr = &bo;
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &boptr, 1, OPAL_BYTE_OBJECT))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* release the data since it has now been copied into our buffer */
free(bo.bytes);
/* if we are not using static ports, we need to send the wireup info */
if (!orte_static_ports) {
/* pack a flag indicating wiring info is provided */
flag = 1;
opal_dss.pack(data, &flag, 1, OPAL_INT8);
/* get wireup info for daemons per the selected routing module */
wireup = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (rc = orte_routed.get_wireup_info(wireup))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(wireup);
return rc;
}
/* if anything was inserted, put it in a byte object for xmission */
if (0 < wireup->bytes_used) {
opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes);
/* pack the number of bytes required by payload */
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numbytes, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(wireup);
return rc;
}
/* pack the byte object */
bo.size = numbytes;
boptr = &bo;
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &boptr, 1, OPAL_BYTE_OBJECT))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(wireup);
return rc;
}
/* release the data since it has now been copied into our buffer */
free(bo.bytes);
} else {
/* pack numbytes=0 so the unpack routine remains sync'd to us */
numbytes = 0;
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numbytes, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(wireup);
return rc;
}
}
OBJ_RELEASE(wireup);
} else {
/* pack a flag indicating no wireup data is provided */
flag = 0;
opal_dss.pack(data, &flag, 1, OPAL_INT8);
}
/* insert an "add-procs" command here so we can cleanly process it on the /* insert an "add-procs" command here so we can cleanly process it on the
* other end * other end
@ -164,6 +254,13 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
return rc; return rc;
} }
/* pack the flag indicating that we are not using regexps - required to
* keep things in order when unpacking due to different ways the data
* can get to the unpacking routine
*/
flag = 0;
opal_dss.pack(data, &flag, 1, OPAL_INT8);
/* are we co-locating debugger daemons? */ /* are we co-locating debugger daemons? */
if (NULL != orte_debugger_daemon) { if (NULL != orte_debugger_daemon) {
orte_app_context_t **apps; orte_app_context_t **apps;
@ -306,6 +403,23 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/*
 * Pull one regular-expression string off the front of "data" and hand it,
 * together with the caller's "jobdat" pointer, to orte_regex_decode_maps().
 *
 * jobdat is forwarded untouched; one caller in this file passes NULL for it.
 * The string obtained from the unpack is released with free() once decoding
 * has been attempted, whether or not decoding succeeded.
 *
 * Returns ORTE_SUCCESS, or the error code of whichever step failed. Failures
 * are reported through ORTE_ERROR_LOG before returning.
 */
static int unpack_regexp(orte_odls_job_t **jobdat, opal_buffer_t *data)
{
    char *pattern;
    int status;
    int num_vals = 1;   /* ask for exactly one string */

    status = opal_dss.unpack(data, &pattern, &num_vals, OPAL_STRING);
    if (ORTE_SUCCESS != status) {
        /* nothing was handed back to us, so there is nothing to free */
        ORTE_ERROR_LOG(status);
        return status;
    }

    status = orte_regex_decode_maps(pattern, jobdat);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }

    /* the decode result is returned as-is; the string is ours to drop */
    free(pattern);
    return status;
}
int orte_odls_base_default_update_daemon_info(opal_buffer_t *data) int orte_odls_base_default_update_daemon_info(opal_buffer_t *data)
{ {
opal_buffer_t wireup; opal_buffer_t wireup;
@ -313,8 +427,42 @@ int orte_odls_base_default_update_daemon_info(opal_buffer_t *data)
int rc; int rc;
orte_std_cntr_t cnt; orte_std_cntr_t cnt;
int32_t numbytes; int32_t numbytes;
int8_t flag;
/* extract the byte object holding the daemonmap */ /* unpack the flag for regexp */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if we have a regexp, then process it so we know the daemonmap */
if (0 < flag) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:update:daemon:info updating nidmap from regexp",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (ORTE_SUCCESS != (rc = unpack_regexp(NULL, data))) {
ORTE_ERROR_LOG(rc);
}
/* update the routing tree */
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree())) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* see if we have wiring info as well */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (0 < flag) {
/* yes - extract and process it */
goto wireup;
}
return rc;
}
/* otherwise, extract the byte object holding the daemonmap */
cnt=1; cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -345,6 +493,18 @@ int orte_odls_base_default_update_daemon_info(opal_buffer_t *data)
return rc; return rc;
} }
/* see if we have wiring info as well */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (0 == flag) {
/* no - just return */
return rc;
}
wireup:
/* unpack the #bytes of daemon wireup info in the message */ /* unpack the #bytes of daemon wireup info in the message */
cnt=1; cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &numbytes, &cnt, OPAL_INT32))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &numbytes, &cnt, OPAL_INT32))) {
@ -396,14 +556,69 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
"%s odls:constructing child list", "%s odls:constructing child list",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* unpack the returned data to create the required structures /* unpack the flag for regexp */
* for a fork launch. Since the data will contain information cnt=1;
* on procs for ALL nodes, we first have to find the value if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
* struct that contains info for our node. ORTE_ERROR_LOG(rc);
*/ goto REPORT_ERROR;
}
/* set the default values since they may not be included in the data */ if (0 < flag) {
if (1 == flag) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls: constructing jobdat from regexp",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* need to setup the job from the regexp */
if (ORTE_SUCCESS != (rc = unpack_regexp(&jobdat, data))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* record the jobid */
*job = jobdat->jobid;
} else {
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls: using jobdat previously extracted from regexp",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* unpack the jobid */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, job, &cnt, ORTE_JOBID))) {
*job = ORTE_JOBID_INVALID; *job = ORTE_JOBID_INVALID;
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
/* find the corresponding jobdat */
for (item = opal_list_get_first(&orte_local_jobdata);
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
orte_odls_job_t *jdat = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jdat->jobid == *job) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:construct_child_list found existing jobdat for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(*job)));
jobdat = jdat;
break;
}
}
if (NULL == jobdat) {
/* we have a problem */
rc = ORTE_ERR_NOT_FOUND;
ORTE_ERROR_LOG(rc);
goto REPORT_ERROR;
}
}
/* fake an app_idx array */
app_idx = (int8_t*)malloc(jobdat->num_procs * sizeof(int8_t));
memset(app_idx, 0, jobdat->num_procs * sizeof(int8_t));
/* if we are doing a timing test, store the time the msg was recvd */
if (orte_timing) {
jobdat->launch_msg_recvd.tv_sec = orte_daemon_msg_recvd.tv_sec;
jobdat->launch_msg_recvd.tv_usec = orte_daemon_msg_recvd.tv_usec;
}
goto find_my_procs;
}
/* unpack the flag - are we co-locating debugger daemons? */ /* unpack the flag - are we co-locating debugger daemons? */
cnt=1; cnt=1;
@ -451,6 +666,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
/* unpack the jobid we are to launch */ /* unpack the jobid we are to launch */
cnt=1; cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, job, &cnt, ORTE_JOBID))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, job, &cnt, ORTE_JOBID))) {
*job = ORTE_JOBID_INVALID;
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto REPORT_ERROR; goto REPORT_ERROR;
} }
@ -475,6 +691,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:construct_child_list found existing jobdat for job %s", "%s odls:construct_child_list found existing jobdat for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(*job))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(*job)));
jobdat = jdat;
break; break;
} }
} }
@ -487,13 +704,13 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
jobdat->jobid = *job; jobdat->jobid = *job;
opal_list_append(&orte_local_jobdata, &jobdat->super); opal_list_append(&orte_local_jobdata, &jobdat->super);
} }
/* if we are doing a timing test, store the time the msg was recvd */ /* if we are doing a timing test, store the time the msg was recvd */
if (orte_timing) { if (orte_timing) {
jobdat->launch_msg_recvd.tv_sec = orte_daemon_msg_recvd.tv_sec; jobdat->launch_msg_recvd.tv_sec = orte_daemon_msg_recvd.tv_sec;
jobdat->launch_msg_recvd.tv_usec = orte_daemon_msg_recvd.tv_usec; jobdat->launch_msg_recvd.tv_usec = orte_daemon_msg_recvd.tv_usec;
} }
/* UNPACK JOB-SPECIFIC DATA */ /* UNPACK JOB-SPECIFIC DATA */
/* unpack the number of nodes involved in this job */ /* unpack the number of nodes involved in this job */
cnt=1; cnt=1;
@ -590,6 +807,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
} }
} }
find_my_procs:
/* cycle through the procs and find mine */ /* cycle through the procs and find mine */
proc.jobid = jobdat->jobid; proc.jobid = jobdat->jobid;
for (j=0; j < jobdat->num_procs; j++) { for (j=0; j < jobdat->num_procs; j++) {
@ -1807,6 +2025,7 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
orte_std_cntr_t cnt; orte_std_cntr_t cnt;
int rc; int rc;
bool found=false; bool found=false;
int8_t flag;
/* protect operations involving the global list of children */ /* protect operations involving the global list of children */
OPAL_THREAD_LOCK(&orte_odls_globals.mutex); OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
@ -1888,9 +2107,26 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
/* the proc needs a copy of both the daemon/node map, and /* the proc needs a copy of both the daemon/node map, and
* the process map for its peers * the process map for its peers
*/ */
if (NULL != jobdat->regexp) {
/* the data is in a regexp - send that */
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:sync sending regexp %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
jobdat->regexp));
flag = 1;
opal_dss.pack(&buffer, &flag, 1, OPAL_INT8);
opal_dss.pack(&buffer, &jobdat->regexp, 1, OPAL_STRING);
} else {
/* the data is in the local byte objects - send them */
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:sync sending byte object",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
flag = 0;
opal_dss.pack(&buffer, &flag, 1, OPAL_INT8);
opal_dss.pack(&buffer, &orte_odls_globals.dmap, 1, OPAL_BYTE_OBJECT); opal_dss.pack(&buffer, &orte_odls_globals.dmap, 1, OPAL_BYTE_OBJECT);
opal_dss.pack(&buffer, &jobdat->pmap, 1, OPAL_BYTE_OBJECT); opal_dss.pack(&buffer, &jobdat->pmap, 1, OPAL_BYTE_OBJECT);
} }
}
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls: sending sync ack to child %s with %ld bytes of data", "%s odls: sending sync ack to child %s with %ld bytes of data",

Просмотреть файл

@ -111,6 +111,7 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr)
ptr->total_slots_alloc = 0; ptr->total_slots_alloc = 0;
ptr->num_procs = 0; ptr->num_procs = 0;
ptr->num_local_procs = 0; ptr->num_local_procs = 0;
ptr->regexp = NULL;
ptr->pmap = NULL; ptr->pmap = NULL;
OBJ_CONSTRUCT(&ptr->collection_bucket, opal_buffer_t); OBJ_CONSTRUCT(&ptr->collection_bucket, opal_buffer_t);
OBJ_CONSTRUCT(&ptr->local_collection, opal_buffer_t); OBJ_CONSTRUCT(&ptr->local_collection, opal_buffer_t);
@ -132,6 +133,10 @@ static void orte_odls_job_destructor(orte_odls_job_t *ptr)
} }
} }
if (NULL != ptr->regexp) {
free(ptr->regexp);
}
if (NULL != ptr->pmap && NULL != ptr->pmap->bytes) { if (NULL != ptr->pmap && NULL != ptr->pmap->bytes) {
free(ptr->pmap->bytes); free(ptr->pmap->bytes);
free(ptr->pmap); free(ptr->pmap);

Просмотреть файл

@ -110,6 +110,7 @@ typedef struct orte_odls_job_t {
orte_std_cntr_t num_nodes; /* number of nodes involved in the job */ orte_std_cntr_t num_nodes; /* number of nodes involved in the job */
orte_vpid_t num_procs; orte_vpid_t num_procs;
int32_t num_local_procs; int32_t num_local_procs;
char *regexp; /* the regular expression describing the job */
opal_byte_object_t *pmap; /* local copy of pidmap byte object */ opal_byte_object_t *pmap; /* local copy of pidmap byte object */
opal_buffer_t collection_bucket; opal_buffer_t collection_bucket;
opal_buffer_t local_collection; opal_buffer_t local_collection;

Просмотреть файл

@ -659,9 +659,16 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t
* system pick any port * system pick any port
*/ */
opal_argv_append_nosize(&ports, "0"); opal_argv_append_nosize(&ports, "0");
/* if static ports were specified, flag it
* so the HNP does the right thing
*/
if (NULL != mca_oob_tcp_component.tcp4_static_ports) {
orte_static_ports = true;
} else {
orte_static_ports = false; orte_static_ports = false;
} }
} }
}
#if OPAL_WANT_IPV6 #if OPAL_WANT_IPV6
if (AF_INET6 == af_family) { if (AF_INET6 == af_family) {
@ -715,9 +722,16 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t
* system pick any port * system pick any port
*/ */
opal_argv_append_nosize(&ports, "0"); opal_argv_append_nosize(&ports, "0");
/* if static ports were specified, flag it
* so the HNP does the right thing
*/
if (NULL != mca_oob_tcp_component.tcp6_static_ports) {
orte_static_ports = true;
} else {
orte_static_ports = false; orte_static_ports = false;
} }
} }
}
#endif /* OPAL_WANT_IPV6 */ #endif /* OPAL_WANT_IPV6 */
/* bozo check - this should be impossible, but... */ /* bozo check - this should be impossible, but... */

Просмотреть файл

@ -58,13 +58,13 @@
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
#include "orte/util/regex.h" #include "orte/util/regex.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/plm/base/plm_private.h" #include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/plm/base/base.h" #include "orte/mca/plm/base/base.h"
static bool active_job_completed_callback = false; static bool active_job_completed_callback = false;
static int orte_plm_base_report_launched(orte_jobid_t job);
static char *pretty_print_timing(int64_t secs, int64_t usecs); static char *pretty_print_timing(int64_t secs, int64_t usecs);
int orte_plm_base_setup_job(orte_job_t *jdata) int orte_plm_base_setup_job(orte_job_t *jdata)
@ -109,6 +109,7 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
*/ */
{ {
char *crud; char *crud;
orte_odls_job_t *jobdat;
crud = orte_regex_encode_maps(jdata); crud = orte_regex_encode_maps(jdata);
opal_output(0, "maps regex: %s", (NULL == crud) ? "NULL" : crud); opal_output(0, "maps regex: %s", (NULL == crud) ? "NULL" : crud);
if (NULL == crud) { if (NULL == crud) {
@ -118,18 +119,30 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
return ORTE_ERROR; return ORTE_ERROR;
} }
orte_util_nidmap_init(NULL); orte_util_nidmap_init(NULL);
orte_regex_decode_maps(crud); orte_regex_decode_maps(crud, &jobdat);
free(crud); free(crud);
/* print-out the map */ /* print-out the map */
orte_nidmap_dump(); orte_nidmap_dump();
orte_jobmap_dump(); orte_jobmap_dump();
/* printout the jobdat */
opal_output(orte_clean_output, "**** DUMP OF JOBDAT %s (%d nodes %d procs) ***",
ORTE_JOBID_PRINT(jobdat->jobid), (int)jobdat->num_nodes, (int)(jobdat->num_procs));
opal_output(orte_clean_output, "\tNum slots: %d\tControl: %x\tStdin: %d",
(int)jobdat->total_slots_alloc, jobdat->controls, (int)jobdat->stdin_target);
opal_output(orte_clean_output, "\tApp: %s", jobdat->apps[0]->app);
opal_output(orte_clean_output, "\tCwd: %s", jobdat->apps[0]->cwd);
crud = opal_argv_join(jobdat->apps[0]->argv, ',');
opal_output(orte_clean_output, "\tArgv: %s", crud);
free(crud);
crud = opal_argv_join(jobdat->apps[0]->env, ',');
opal_output(orte_clean_output, "\tEnv: %s", crud);
free(crud);
orte_never_launched = true; orte_never_launched = true;
ORTE_UPDATE_EXIT_STATUS(0); ORTE_UPDATE_EXIT_STATUS(0);
orte_trigger_event(&orte_exit); orte_trigger_event(&orte_exit);
return ORTE_ERROR; return ORTE_ERROR;
} }
{ {
opal_byte_object_t bo; opal_byte_object_t bo;
@ -928,7 +941,7 @@ static void app_report_launch(int status, orte_process_name_t* sender,
} }
static int orte_plm_base_report_launched(orte_jobid_t job) int orte_plm_base_report_launched(orte_jobid_t job)
{ {
int rc; int rc;
orte_job_t *jdata; orte_job_t *jdata;

Просмотреть файл

@ -94,6 +94,7 @@ ORTE_DECLSPEC int orte_plm_base_set_progress_sched(int sched);
ORTE_DECLSPEC int orte_plm_base_setup_job(orte_job_t *jdata); ORTE_DECLSPEC int orte_plm_base_setup_job(orte_job_t *jdata);
ORTE_DECLSPEC int orte_plm_base_launch_apps(orte_jobid_t job); ORTE_DECLSPEC int orte_plm_base_launch_apps(orte_jobid_t job);
ORTE_DECLSPEC void orte_plm_base_launch_failed(orte_jobid_t job, pid_t pid, int status, orte_job_state_t state); ORTE_DECLSPEC void orte_plm_base_launch_failed(orte_jobid_t job, pid_t pid, int status, orte_job_state_t state);
ORTE_DECLSPEC int orte_plm_base_report_launched(orte_jobid_t job);
ORTE_DECLSPEC int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons); ORTE_DECLSPEC int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons);

Просмотреть файл

@ -63,6 +63,7 @@
#include "orte/types.h" #include "orte/types.h"
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/util/regex.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
@ -159,6 +160,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
int proc_vpid_index; int proc_vpid_index;
orte_jobid_t failed_job; orte_jobid_t failed_job;
bool failed_launch=true; bool failed_launch=true;
bool using_regexp=false;
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) { if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
/* if this is a request to launch a local slave, /* if this is a request to launch a local slave,
@ -320,15 +322,6 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
argv[proc_vpid_index] = strdup(name_string); argv[proc_vpid_index] = strdup(name_string);
free(name_string); free(name_string);
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(argv, ' ');
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slurm: final top-level argv:\n\t%s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == param) ? "NULL" : param));
if (NULL != param) free(param);
}
/* Copy the prefix-directory specified in the /* Copy the prefix-directory specified in the
corresponding app_context. If there are multiple, corresponding app_context. If there are multiple,
different prefix's in the app context, complain (i.e., only different prefix's in the app context, complain (i.e., only
@ -369,6 +362,29 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
opal_setenv(var, "rsh", true, &env); opal_setenv(var, "rsh", true, &env);
free(var); free(var);
/* if we can do it, use the regexp to launch the apps - this
* requires that the user requested this mode, that we were
* provided with static ports, and that we only have one
* app_context
*/
if (orte_use_regexp && orte_static_ports && jdata->num_apps < 2) {
char *regexp;
regexp = orte_regex_encode_maps(jdata);
opal_argv_append(&argc, &argv, "--launch");
opal_argv_append(&argc, &argv, regexp);
free(regexp);
using_regexp = true;
}
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(argv, ' ');
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slurm: final top-level argv:\n\t%s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == param) ? "NULL" : param));
if (NULL != param) free(param);
}
/* exec the daemon(s) */ /* exec the daemon(s) */
if (ORTE_SUCCESS != (rc = plm_slurm_start_proc(argc, argv, env, cur_prefix))) { if (ORTE_SUCCESS != (rc = plm_slurm_start_proc(argc, argv, env, cur_prefix))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -393,6 +409,18 @@ launch_apps:
/* get here if daemons launch okay - any failures now by apps */ /* get here if daemons launch okay - any failures now by apps */
launching_daemons = false; launching_daemons = false;
failed_job = active_job; failed_job = active_job;
if (using_regexp) {
/* daemons already have launch cmd - just wait for them to
* report back
*/
if (ORTE_SUCCESS != (rc = orte_plm_base_report_launched(jdata->jobid))) {
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:slurm:launch failed for job %s on error %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
goto cleanup;
}
} else {
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) { if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slurm: launch of apps failed for job %s on error %s", "%s plm:slurm: launch of apps failed for job %s on error %s",
@ -400,6 +428,7 @@ launch_apps:
ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc))); ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
goto cleanup; goto cleanup;
} }
}
/* declare the launch a success */ /* declare the launch a success */
failed_launch = false; failed_launch = false;

Просмотреть файл

@ -276,7 +276,7 @@ void orte_daemon_cmd_processor(int fd, short event, void *data)
orte_daemon_msg_recvd.tv_sec = mesg_recvd.tv_sec; orte_daemon_msg_recvd.tv_sec = mesg_recvd.tv_sec;
orte_daemon_msg_recvd.tv_usec = mesg_recvd.tv_usec; orte_daemon_msg_recvd.tv_usec = mesg_recvd.tv_usec;
} }
/* cmd contains daemon update info - process it */ /* the cmd contains daemon update info - process it */
if (ORTE_SUCCESS != (ret = orte_odls_base_default_update_daemon_info(buffer))) { if (ORTE_SUCCESS != (ret = orte_odls_base_default_update_daemon_info(buffer))) {
ORTE_ERROR_LOG(ret); ORTE_ERROR_LOG(ret);
goto CLEANUP; goto CLEANUP;

Просмотреть файл

@ -193,6 +193,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
NULL, OPAL_CMD_LINE_TYPE_STRING, NULL, OPAL_CMD_LINE_TYPE_STRING,
"Create a new xterm window and display output from the specified ranks there" }, "Create a new xterm window and display output from the specified ranks there" },
{ NULL, NULL, NULL, '\0', "launch", "launch", 1,
&orted_launch_cmd, OPAL_CMD_LINE_TYPE_STRING,
"A regular expression describing the job to be launched at startup" },
/* End of list */ /* End of list */
{ NULL, NULL, NULL, '\0', NULL, NULL, 0, { NULL, NULL, NULL, '\0', NULL, NULL, 0,
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
@ -685,6 +689,20 @@ int orte_daemon(int argc, char *argv[])
ORTE_TIMER_EVENT(orted_globals.heartbeat, 0, orte_plm_base_heartbeat); ORTE_TIMER_EVENT(orted_globals.heartbeat, 0, orte_plm_base_heartbeat);
} }
/* if we were given a launch string, then process it */
if (NULL != orted_launch_cmd) {
opal_buffer_t launch;
int8_t flag;
orte_daemon_cmd_flag_t command = ORTE_DAEMON_ADD_LOCAL_PROCS;
OBJ_CONSTRUCT(&launch, opal_buffer_t);
opal_dss.pack(&launch, &command, 1, ORTE_DAEMON_CMD);
flag = 1;
opal_dss.pack(&launch, &flag, 1, OPAL_INT8);
opal_dss.pack(&launch, &orted_launch_cmd, 1, OPAL_STRING);
ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &launch, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor);
OBJ_DESTRUCT(&launch);
}
/* wait to hear we are done */ /* wait to hear we are done */
opal_event_dispatch(); opal_event_dispatch();

Просмотреть файл

@ -64,7 +64,6 @@ bool orte_hetero_apps = false;
bool orte_never_launched = false; bool orte_never_launched = false;
bool orte_devel_level_output = false; bool orte_devel_level_output = false;
int32_t orte_contiguous_nodes;
char **orte_launch_environ; char **orte_launch_environ;
bool orte_hnp_is_allocated = false; bool orte_hnp_is_allocated = false;
@ -105,6 +104,8 @@ bool orte_send_profile;
/* Nidmap and job maps */ /* Nidmap and job maps */
opal_pointer_array_t orte_nidmap; opal_pointer_array_t orte_nidmap;
opal_pointer_array_t orte_jobmap; opal_pointer_array_t orte_jobmap;
bool orte_use_regexp;
char *orted_launch_cmd = NULL;
/* list of local children on a daemon */ /* list of local children on a daemon */
opal_list_t orte_local_children; opal_list_t orte_local_children;

Просмотреть файл

@ -456,7 +456,6 @@ ORTE_DECLSPEC extern bool orte_leave_session_attached;
ORTE_DECLSPEC extern bool orte_do_not_launch; ORTE_DECLSPEC extern bool orte_do_not_launch;
ORTE_DECLSPEC extern bool orted_spin_flag; ORTE_DECLSPEC extern bool orted_spin_flag;
ORTE_DECLSPEC extern bool orte_static_ports; ORTE_DECLSPEC extern bool orte_static_ports;
ORTE_DECLSPEC extern int32_t orte_contiguous_nodes;
ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames; ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames;
ORTE_DECLSPEC extern bool orte_show_resolved_nodenames; ORTE_DECLSPEC extern bool orte_show_resolved_nodenames;
ORTE_DECLSPEC extern int orted_debug_failure; ORTE_DECLSPEC extern int orted_debug_failure;
@ -508,6 +507,8 @@ ORTE_DECLSPEC extern bool orte_send_profile;
/* Nidmap and job maps */ /* Nidmap and job maps */
ORTE_DECLSPEC extern opal_pointer_array_t orte_nidmap; ORTE_DECLSPEC extern opal_pointer_array_t orte_nidmap;
ORTE_DECLSPEC extern opal_pointer_array_t orte_jobmap; ORTE_DECLSPEC extern opal_pointer_array_t orte_jobmap;
ORTE_DECLSPEC extern bool orte_use_regexp;
ORTE_DECLSPEC extern char *orted_launch_cmd;
/* list of local children on a daemon */ /* list of local children on a daemon */
ORTE_DECLSPEC extern opal_list_t orte_local_children; ORTE_DECLSPEC extern opal_list_t orte_local_children;

Просмотреть файл

@ -205,10 +205,11 @@ int orte_register_params(void)
false, false, (int)false, &value); false, false, (int)false, &value);
orte_keep_fqdn_hostnames = OPAL_INT_TO_BOOL(value); orte_keep_fqdn_hostnames = OPAL_INT_TO_BOOL(value);
/* whether or not contiguous nodenames are in use */ /* whether or not to use regular expressions for launch */
mca_base_param_reg_int_name("orte", "contiguous_nodes", mca_base_param_reg_int_name("orte", "use_regexp",
"Number of nodes after which contiguous nodename encoding will automatically be used [default: INT_MAX]", "Whether or not to use regular expressions for launch [default: no]",
false, false, INT32_MAX, &orte_contiguous_nodes); false, false, (int)false, &value);
orte_use_regexp = OPAL_INT_TO_BOOL(value);
/* whether to tag output */ /* whether to tag output */
mca_base_param_reg_int_name("orte", "tag_output", mca_base_param_reg_int_name("orte", "tag_output",

Просмотреть файл

@ -353,6 +353,10 @@ static opal_cmd_line_init_t cmd_line_init[] = {
NULL, OPAL_CMD_LINE_TYPE_BOOL, NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Output a brief periodic report on launch progress" }, "Output a brief periodic report on launch progress" },
{ "orte", "use", "regexp", '\0', "use-regexp", "use-regexp", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Use regular expressions for launch" },
/* End of list */ /* End of list */
{ NULL, NULL, NULL, '\0', NULL, NULL, 0, { NULL, NULL, NULL, '\0', NULL, NULL, 0,
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }

Просмотреть файл

@ -53,6 +53,7 @@
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/util/regex.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/rml/base/rml_contact.h"
@ -65,6 +66,8 @@ int orte_util_nidmap_init(opal_buffer_t *buffer)
int32_t cnt; int32_t cnt;
int rc; int rc;
opal_byte_object_t *bo; opal_byte_object_t *bo;
int8_t flag;
char *regexp;
if (!initialized) { if (!initialized) {
/* need to construct the global arrays */ /* need to construct the global arrays */
@ -85,6 +88,29 @@ int orte_util_nidmap_init(opal_buffer_t *buffer)
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/* extract the flag indicating the type of info in the buffer */
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (0 < flag) {
/* the data is a regular expression - extract and parse it
* to get the daemonmap and process map
*/
cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &regexp, &cnt, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_SUCCESS != (rc = orte_regex_decode_maps(regexp, NULL))) {
ORTE_ERROR_LOG(rc);
}
free(regexp);
return rc;
}
/* extract the byte object holding the daemonmap */ /* extract the byte object holding the daemonmap */
cnt=1; cnt=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) { if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) {
@ -263,15 +289,11 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
{ {
orte_vpid_t *vpids; orte_vpid_t *vpids;
orte_node_t **nodes; orte_node_t **nodes;
char prefix[ORTE_MAX_NODE_PREFIX], *tmp; int32_t i, num_nodes;
int32_t i, len, firstnode, lastnode, nodenum, num_nodes;
uint8_t command = ORTE_CONTIG_NODE_CMD;
uint8_t num_digs; uint8_t num_digs;
uint8_t incdec;
int rc; int rc;
char *nodename; char *nodename;
opal_buffer_t buf; opal_buffer_t buf;
int step;
int32_t *arch; int32_t *arch;
/* setup a buffer for tmp use */ /* setup a buffer for tmp use */
@ -314,141 +336,7 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
} }
} }
/* see if the cluster is configured with contiguous /* pack every nodename individually */
* node names and we have more than the HNP
*/
if (orte_contiguous_nodes < num_nodes) {
/* discover the prefix - find first non-alpha character */
len = strlen(nodes[1]->name);
memset(prefix, 0, ORTE_MAX_NODE_PREFIX);
prefix[0] = nodes[1]->name[0]; /* must start with alpha */
for (i=1; i < len; i++) {
if (!isalpha(nodes[1]->name[i])) {
/* found a non-alpha char */
if (!isdigit(nodes[1]->name[i])) {
/* if it is anything but a digit,
* then that's not good
*/
opal_output(0, "%s encode:nidmap Nodename pattern is nonstandard",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
return ORTE_ERROR;
}
/* okay, this defines end of the prefix.
* convert rest of name to an offset
*/
firstnode = strtol(&(nodes[1]->name[i]), NULL, 10);
/* figure out how many digits are in the index */
for (num_digs=0; isdigit(nodes[1]->name[i+num_digs]); num_digs++);
goto PACK;
}
prefix[i] = nodes[1]->name[i];
}
PACK:
/* begin encoding rest of map by indicating that this will
* be a contiguous node map
*/
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the prefix */
tmp = &prefix[0];
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &tmp, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
len = strlen(prefix);
/* pack the number of digits in the index */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &num_digs, 1, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* and the starting offset */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &firstnode, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s encode:nidmap:contig_nodes prefix %s num_digits %d offset %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), prefix, num_digs, firstnode));
lastnode = strtol(&(nodes[2]->name[i]), NULL, 10);
if ((lastnode - firstnode) < 0) {
/* we are decrementing */
incdec = 0;
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &incdec, 1, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
} else {
/* we are incrementing */
incdec = 1;
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &incdec, 1, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
lastnode = firstnode;
/* cycle through the nodes - pack the starting offset
* and total number of nodes in each contiguous range
*/
for (i=2; i < num_nodes; i++) {
nodenum = strtol(&(nodes[i]->name[len]), NULL, 10);
step = nodenum -lastnode;
if (step < 0) {
/* we are decrementing */
step = lastnode - nodenum;
}
if (step > 1) {
/* have a break - indicate end of range */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &lastnode, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* indicate start of new range */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &nodenum, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s encode:nidmap:contig_nodes end range %d start next range %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lastnode, nodenum));
}
lastnode = nodenum;
}
/* pack end of range */
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &lastnode, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s encode:nidmap:contig_nodes end range %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lastnode));
/* pack flag end of ranges */
lastnode = -1;
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &lastnode, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
} else {
/* if the nodes aren't contiguous, then we need
* to simply pack every nodename individually
*/
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s encode:nidmap non_contig_nodes - packing all names",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* indicate that this will not be a contiguous node map */
command = ORTE_NON_CONTIG_NODE_CMD;
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
for (i=1; i < num_nodes; i++) { for (i=1; i < num_nodes; i++) {
if (!orte_keep_fqdn_hostnames) { if (!orte_keep_fqdn_hostnames) {
char *ptr; char *ptr;
@ -468,7 +356,6 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
} }
} }
} }
}
/* since the daemon vpids may not correspond to the node /* since the daemon vpids may not correspond to the node
* index, we need to also pack the vpid array for all * index, we need to also pack the vpid array for all
@ -588,15 +475,12 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
int orte_util_decode_nodemap(opal_byte_object_t *bo) int orte_util_decode_nodemap(opal_byte_object_t *bo)
{ {
int n, loc, k, diglen, namelen; int n;
char *prefix, digits[10]; int32_t num_nodes, i, num_daemons;
int32_t num_nodes, lastnode, endrange, i, num_daemons;
orte_nid_t *node; orte_nid_t *node;
orte_vpid_t *vpids; orte_vpid_t *vpids;
uint8_t command, num_digs; uint8_t num_digs;
orte_nid_t **nd, *ndptr; orte_nid_t **nd, *ndptr;
uint8_t incdec;
int32_t index, step;
int32_t *arch; int32_t *arch;
opal_buffer_t buf; opal_buffer_t buf;
opal_byte_object_t *boptr; opal_byte_object_t *boptr;
@ -655,106 +539,7 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
return rc; return rc;
} }
/* unpack flag to see if this is a contiguous node map or not */ /* loop over nodes and unpack the raw nodename */
n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &command, &n, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (ORTE_CONTIG_NODE_CMD == command) {
/* unpack the prefix */
n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &prefix, &n, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* the number of digits in the index */
n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &num_digs, &n, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* and the starting offset */
n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &lastnode, &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack increment/decrement flag */
n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &incdec, &n, OPAL_INT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the end of the range */
n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &endrange, &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* setup loop params */
if (0 == incdec) {
endrange -= 1;
step = -1;
} else {
endrange += 1;
step = 1;
}
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s decode:nidmap:contig_nodes prefix %s num_digits %d offset %d endrange %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), prefix, num_digs, lastnode, endrange));
namelen = strlen(prefix) + num_digs + 1;
/* cycle through the ranges */
index = 1;
while (1) {
for (i=lastnode; i != endrange; i += step) {
node = OBJ_NEW(orte_nid_t);
/* allocate space for the nodename */
node->name = (char*)malloc(namelen);
memset(node->name, 0, namelen);
loc = snprintf(node->name, namelen, "%s", prefix);
diglen = num_digs - snprintf(digits, 10, "%d", i);
for (k=0; k < diglen && loc < namelen; k++) {
node->name[loc] = '0';
loc++;
}
strncat(node->name, digits, num_digs);
/* the arch defaults to our arch so that non-hetero
* case will yield correct behavior
*/
opal_pointer_array_set_item(&orte_nidmap, index, node);
index++;
}
/* unpack start of new range */
n=1;
opal_dss.unpack(&buf, &lastnode, &n, OPAL_INT32);
/* if that is -1, then it flags no more ranges */
if (-1 == lastnode) {
goto process_daemons;
}
n=1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &endrange, &n, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (0 == incdec) {
endrange -= 1;
} else {
endrange += 1;
}
}
} else {
/* not contiguous - just loop over nodes and
* unpack the raw nodename
*/
for (i=1; i < num_nodes; i++) { for (i=1; i < num_nodes; i++) {
node = OBJ_NEW(orte_nid_t); node = OBJ_NEW(orte_nid_t);
/* the arch defaults to our arch so that non-hetero /* the arch defaults to our arch so that non-hetero
@ -769,9 +554,7 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
return rc; return rc;
} }
} }
}
process_daemons:
/* unpack the daemon names */ /* unpack the daemon names */
vpids = (orte_vpid_t*)malloc(num_nodes * sizeof(orte_vpid_t)); vpids = (orte_vpid_t*)malloc(num_nodes * sizeof(orte_vpid_t));
n=num_nodes; n=num_nodes;
@ -1141,7 +924,7 @@ cleanup:
orte_jmap_t* orte_util_lookup_jmap(orte_jobid_t job) orte_jmap_t* orte_util_lookup_jmap(orte_jobid_t job)
{ {
int i; int i;
orte_jmap_t **jmaps; orte_jmap_t *jmap;
/* unfortunately, job objects cannot be stored /* unfortunately, job objects cannot be stored
* by index number as the jobid is a constructed * by index number as the jobid is a constructed
@ -1151,17 +934,16 @@ orte_jmap_t* orte_util_lookup_jmap(orte_jobid_t job)
* left-justified as cleanup is done - and array * left-justified as cleanup is done - and array
* entries set to NULL - upon job completion. * entries set to NULL - upon job completion.
*/ */
jmaps = (orte_jmap_t**)orte_jobmap.addr;
for (i=0; i < orte_jobmap.size; i++) { for (i=0; i < orte_jobmap.size; i++) {
if (NULL == jmaps[i]) { if (NULL == (jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, i))) {
continue; continue;
} }
OPAL_OUTPUT_VERBOSE((10, orte_debug_output, OPAL_OUTPUT_VERBOSE((10, orte_debug_output,
"%s lookup:pmap: checking job %s for job %s", "%s lookup:pmap: checking job %s for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jmaps[i]->job), ORTE_JOBID_PRINT(job))); ORTE_JOBID_PRINT(jmap->job), ORTE_JOBID_PRINT(job)));
if (job == jmaps[i]->job) { if (job == jmap->job) {
return jmaps[i]; return jmap;
} }
} }
@ -1192,19 +974,18 @@ orte_pmap_t* orte_util_lookup_pmap(orte_process_name_t *proc)
static orte_nid_t* find_daemon_node(orte_process_name_t *proc) static orte_nid_t* find_daemon_node(orte_process_name_t *proc)
{ {
int32_t i; int32_t i;
orte_nid_t **nids; orte_nid_t *nid;
nids = (orte_nid_t**)orte_nidmap.addr;
for (i=0; i < orte_nidmap.size; i++) { for (i=0; i < orte_nidmap.size; i++) {
if (NULL == nids[i]) { if (NULL == (nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) {
continue; continue;
} }
OPAL_OUTPUT_VERBOSE((10, orte_debug_output, OPAL_OUTPUT_VERBOSE((10, orte_debug_output,
"%s find:daemon:node: checking daemon %s for %s", "%s find:daemon:node: checking daemon %s for %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_VPID_PRINT(nids[i]->daemon), ORTE_VPID_PRINT(proc->vpid))); ORTE_VPID_PRINT(nid->daemon), ORTE_VPID_PRINT(proc->vpid)));
if (nids[i]->daemon == proc->vpid) { if (nid->daemon == proc->vpid) {
return nids[i]; return nid;
} }
} }

Просмотреть файл

@ -26,12 +26,30 @@
#ifdef HAVE_UNISTD_H #ifdef HAVE_UNISTD_H
#include <unistd.h> #include <unistd.h>
#endif #endif
#ifdef HAVE_SYS_SOCKET_H
#include <sys/socket.h>
#endif
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#ifdef HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#ifdef HAVE_IFADDRS_H
#include <ifaddrs.h>
#endif
#include "opal/util/argv.h" #include "opal/util/argv.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/util/show_help.h" #include "orte/util/show_help.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/util/regex.h" #include "orte/util/regex.h"
@ -445,19 +463,23 @@ char* orte_regex_encode_maps(orte_job_t *jdata)
char prefix[ORTE_MAX_NODE_PREFIX]; char prefix[ORTE_MAX_NODE_PREFIX];
int startnum; int startnum;
opal_list_item_t *item; opal_list_item_t *item;
char **regexargs = NULL, *tmp; char **regexargs = NULL, *tmp, *tmp2;
int32_t num_nodes, start, cnt, ppn, nppn; int32_t num_nodes, start, cnt, ppn, nppn;
orte_vpid_t vpid_start, start_vpid, end_vpid, base; orte_vpid_t vpid_start, start_vpid, end_vpid, base;
char *regexp = NULL; char *regexp = NULL;
bool byslot; bool byslot;
orte_node_rank_t node_rank, nrank; orte_node_rank_t node_rank, nrank;
char suffix, sfx; char suffix, sfx;
orte_app_context_t *app;
/* this is only supported with regular maps - i.e., when /* this is only supported with regular maps - i.e., when
* the mapping is byslot or bynode. Irregular maps cannot * the mapping is byslot or bynode. Irregular maps cannot
* be expressed in a regular expression * be expressed in a regular expression
*
* Also only supported for one app_context
*/ */
if (jdata->map->policy & ORTE_RMAPS_BYUSER) { if (jdata->map->policy & ORTE_RMAPS_BYUSER ||
jdata->num_apps > 1) {
return NULL; return NULL;
} }
@ -598,6 +620,39 @@ char* orte_regex_encode_maps(orte_job_t *jdata)
opal_argv_append_nosize(&regexargs, tmp); opal_argv_append_nosize(&regexargs, tmp);
free(tmp); free(tmp);
/* next comes the total slots allocated to us */
asprintf(&tmp, "SLOTS=%d", (int)jdata->total_slots_alloc);
opal_argv_append_nosize(&regexargs, tmp);
free(tmp);
/* the control flags for this job */
asprintf(&tmp, "CTRLS=%d", (int)jdata->controls);
opal_argv_append_nosize(&regexargs, tmp);
free(tmp);
/* the stdin target for the job */
asprintf(&tmp, "STDIN=%d", (int)jdata->stdin_target);
opal_argv_append_nosize(&regexargs, tmp);
free(tmp);
/* the app_context for the job - can only be one! Just include
* the required portions
*/
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
asprintf(&tmp, "APP=\"%s:%s\"", app->app, app->cwd);
opal_argv_append_nosize(&regexargs, tmp);
free(tmp);
tmp2 = opal_argv_join(app->argv, '#');
asprintf(&tmp, "ARGV=\"%s\"", (NULL == tmp2) ? "NULL" : tmp2);
free(tmp2);
opal_argv_append_nosize(&regexargs, tmp);
free(tmp);
tmp2 = opal_argv_join(app->env, '#');
asprintf(&tmp, "ENV=\"%s\"", (NULL == tmp2) ? "NULL" : tmp2);
free(tmp2);
opal_argv_append_nosize(&regexargs, tmp);
free(tmp);
/* next comes the starting daemon vpid */ /* next comes the starting daemon vpid */
asprintf(&tmp, "DVPID=%s", ORTE_VPID_PRINT(jdata->map->daemon_vpid_start)); asprintf(&tmp, "DVPID=%s", ORTE_VPID_PRINT(jdata->map->daemon_vpid_start));
opal_argv_append_nosize(&regexargs, tmp); opal_argv_append_nosize(&regexargs, tmp);
@ -855,10 +910,10 @@ cleanup:
return rc; return rc;
} }
int orte_regex_decode_maps(char *regexp) int orte_regex_decode_maps(char *regexp, orte_odls_job_t **jobdat)
{ {
char **seqs, *ptr, **names; char **seqs, *ptr, **names, *ptr2, check[5];
int i, j, k, n, rc; int i, j, k, n, entry, rc;
int ppn, step, start_nrank, nrank; int ppn, step, start_nrank, nrank;
int32_t tmp32; int32_t tmp32;
orte_vpid_t daemon_vpid, vpid; orte_vpid_t daemon_vpid, vpid;
@ -867,27 +922,46 @@ int orte_regex_decode_maps(char *regexp)
orte_jmap_t *jmap; orte_jmap_t *jmap;
orte_pmap_t *pmap; orte_pmap_t *pmap;
bool found; bool found;
orte_odls_job_t *jdat;
orte_app_context_t *app;
opal_list_item_t *item;
int num_procs, num_nodes;
struct hostent *h;
opal_buffer_t buf;
char *uri, *addr;
orte_process_name_t proc;
char *proc_name;
bool hnp_entry;
/* if regexp is NULL, then nothing to parse */ /* if regexp is NULL, then nothing to parse */
if (NULL == regexp) { if (NULL == regexp) {
return ORTE_ERR_SILENT; return ORTE_ERR_SILENT;
} }
/* ensure the global nidmap/pidmap arrays are initialized */
if (ORTE_SUCCESS != (rc = orte_util_nidmap_init(NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* break the regexp into its component parts - this is trivial /* break the regexp into its component parts - this is trivial
* because they are all separated by commas! * because they are all separated by commas!
*/ */
seqs = opal_argv_split(regexp, ','); seqs = opal_argv_split(regexp, ',');
/* we need to have at least three elements or something is wrong */ /* we need to have at least six elements or something is wrong */
if (opal_argv_count(seqs) < 3) { if (opal_argv_count(seqs) < 6) {
opal_argv_free(seqs); opal_argv_free(seqs);
return ORTE_ERROR; return ORTE_ERROR;
} }
/* start parsing with the first entry */
entry=0;
/* the first entry is the local jobid, so we extract that and /* the first entry is the local jobid, so we extract that and
* convert it into a global jobid * convert it into a global jobid
*/ */
ptr = strchr(seqs[0], '='); ptr = strchr(seqs[entry++], '=');
if (NULL == ptr) { if (NULL == ptr) {
opal_argv_free(seqs); opal_argv_free(seqs);
return ORTE_ERROR; return ORTE_ERROR;
@ -915,8 +989,146 @@ int orte_regex_decode_maps(char *regexp)
opal_pointer_array_add(&orte_jobmap, jmap); opal_pointer_array_add(&orte_jobmap, jmap);
} }
/* the second entry is the starting daemon vpid for the job being launched */ jdat = NULL;
ptr = strchr(seqs[0], '='); if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
/* even though we are unpacking an add_local_procs cmd, we cannot assume
* that no job record for this jobid exists. A race condition exists that
* could allow another daemon's procs to call us with a collective prior
* to our unpacking add_local_procs. So lookup the job record for this jobid
* and see if it already exists
*/
for (item = opal_list_get_first(&orte_local_jobdata);
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
orte_odls_job_t *jdt = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jdt->jobid == jobid) {
jdat = jdt;
break;
}
}
if (NULL == jdat) {
/* setup jobdat object for this job */
jdat = OBJ_NEW(orte_odls_job_t);
jdat->jobid = jobid;
opal_list_append(&orte_local_jobdata, &jdat->super);
}
if (NULL != jobdat) {
*jobdat = jdat;
}
/* see if this was previously decoded */
if (NULL != jdat->regexp) {
/* yep - don't decode it again */
opal_argv_free(seqs);
return ORTE_SUCCESS;
}
/* next entry is the total slots allocated to this job */
ptr = strchr(seqs[entry++], '=');
if (NULL == ptr) {
opal_argv_free(seqs);
return ORTE_ERROR;
}
ptr++;
jdat->total_slots_alloc = strtol(ptr, NULL, 10);
/* next entry is the control flags for the job */
ptr = strchr(seqs[entry++], '=');
if (NULL == ptr) {
opal_argv_free(seqs);
return ORTE_ERROR;
}
ptr++;
jdat->controls = strtol(ptr, NULL, 10);
/* next entry - stdin target */
ptr = strchr(seqs[entry++], '=');
if (NULL == ptr) {
opal_argv_free(seqs);
return ORTE_ERROR;
}
ptr++;
jdat->stdin_target = strtol(ptr, NULL, 10);
/* next entry - the app_context itself */
ptr = strchr(seqs[entry++], '=');
if (NULL == ptr) {
opal_argv_free(seqs);
return ORTE_ERROR;
}
ptr++;
/* some shells will strip the starting and ending quotes, and some won't -
* so check for them here
*/
if ('\"' == *ptr) ptr++;
if ('\"' == ptr[strlen(ptr)-1]) ptr[strlen(ptr)-1] = '\0';
/* create the app_context object */
app = OBJ_NEW(orte_app_context_t);
jdat->apps = (orte_app_context_t**)malloc(sizeof(orte_app_context_t*));
jdat->apps[0] = app;
jdat->num_apps = 1;
/* get the app and the cwd by hand */
ptr2 = strchr(ptr, ':');
*ptr2 = '\0';
app->app = strdup(ptr);
ptr = ++ptr2;
app->cwd = strdup(ptr);
/* the next entry is the argv for the app_context, separated by '#'. We
* assume we can use argv_split for this purpose. First check, though, for
* NULL, indicating there were no argvs
*/
ptr = strchr(seqs[entry++], '=');
if (NULL == ptr) {
opal_argv_free(seqs);
return ORTE_ERROR;
}
ptr++;
/* some shells will strip the starting and ending quotes, and some won't -
* so check for them here
*/
if ('\"' == *ptr) ptr++;
if ('\"' == ptr[strlen(ptr)-1]) ptr[strlen(ptr)-1] = '\0';
for (i=0; i < 4; i++) {
check[i] = ptr[i];
}
check[4] = '\0';
if (0 != strcmp("NULL", check)) {
/* there are argvs */
app->argv = opal_argv_split(ptr, '#');
}
/* the next entry is the env for the app_context, also separated by '#'.
* Again, start by checking for NULL
*/
ptr = strchr(seqs[entry++], '=');
if (NULL == ptr) {
opal_argv_free(seqs);
return ORTE_ERROR;
}
ptr++;
/* some shells will strip the starting and ending quotes, and some won't -
* so check for them here
*/
if ('\"' == *ptr) ptr++;
if ('\"' == ptr[strlen(ptr)-1]) ptr[strlen(ptr)-1] = '\0';
for (i=0; i < 4; i++) {
check[i] = ptr[i];
}
check[4] = '\0';
if (0 != strcmp("NULL", check)) {
/* there are env entries */
app->env = opal_argv_split(ptr, '#');
}
} else {
entry += 6;
}
/* next entry is the starting daemon vpid for the job being launched */
ptr = strchr(seqs[entry++], '=');
if (NULL == ptr) { if (NULL == ptr) {
opal_argv_free(seqs); opal_argv_free(seqs);
return ORTE_ERROR; return ORTE_ERROR;
@ -937,7 +1149,11 @@ int orte_regex_decode_maps(char *regexp)
* each node * each node
*/ */
names = NULL; names = NULL;
for (n=2; n < opal_argv_count(seqs); n++) { num_procs = 0;
num_nodes = 0;
hnp_entry = true;
OBJ_CONSTRUCT(&buf, opal_buffer_t);
for (n=entry; n < opal_argv_count(seqs); n++) {
/* parse the node entry to get a list of all node names in it */ /* parse the node entry to get a list of all node names in it */
if (ORTE_SUCCESS != (rc = parse_node_range(seqs[n], &names, &vpid, &ppn, &step, &start_nrank))) { if (ORTE_SUCCESS != (rc = parse_node_range(seqs[n], &names, &vpid, &ppn, &step, &start_nrank))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -963,22 +1179,64 @@ int orte_regex_decode_maps(char *regexp)
nid->name = strdup(names[i]); nid->name = strdup(names[i]);
nid->index = opal_pointer_array_add(&orte_nidmap, nid); nid->index = opal_pointer_array_add(&orte_nidmap, nid);
} }
/* are there any procs on this node? */ /* is this the hnp entry (very first one), or are there any procs on this node? */
if (ORTE_VPID_INVALID != vpid) { if (hnp_entry || ORTE_VPID_INVALID != vpid) {
/* yep - add a daemon if we don't already have one, otherwise /* yep - add a daemon if we don't already have one, otherwise
* this is just adding procs to an existing daemon * this is just adding procs to an existing daemon
*/ */
if (ORTE_VPID_INVALID != daemon_vpid && if (ORTE_VPID_INVALID != daemon_vpid &&
ORTE_VPID_INVALID == nid->daemon) { ORTE_VPID_INVALID == nid->daemon) {
/* no daemon assigned yet - add it */ /* no daemon assigned yet - add it */
nid->daemon = daemon_vpid; if (hnp_entry) {
daemon_vpid++; /* the hnp is always daemon=0 */
nid->daemon = 0;
hnp_entry = false; /* only do this once */
} else {
nid->daemon = daemon_vpid++;
} }
/* if we are using static ports, create the contact info
* for the daemon on this node
*/
if (orte_static_ports) {
/* lookup the address of this node */
if (NULL == (h = gethostbyname(nid->name))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
addr = inet_ntoa(*(struct in_addr*)h->h_addr_list[0]);
OPAL_OUTPUT_VERBOSE((0, orte_debug_output,
"%s orte:regex: constructing static path to node %s daemon %d addr %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
nid->name, (int)nid->daemon, addr));
/* since we are using static ports, all my fellow daemons will be on my
* port. Setup the contact info for each daemon in my hash tables. Note
* that this will -not- open a port to those daemons, but will only
* define the info necessary for opening such a port if/when I communicate
* to them
*/
/* construct the URI */
proc.jobid = ORTE_PROC_MY_NAME->jobid;
proc.vpid = nid->daemon;
orte_util_convert_process_name_to_string(&proc_name, &proc);
asprintf(&uri, "%s;tcp://%s:%d", proc_name, addr, (int)orte_process_info.my_port);
opal_dss.pack(&buf, &uri, 1, OPAL_STRING);
free(proc_name);
free(uri);
}
}
/* cycle through the ppn, adding a pmap /* cycle through the ppn, adding a pmap
* for each new rank * for each new rank
*/ */
nrank = start_nrank; nrank = start_nrank;
for (k=0; k < ppn; k++) { for (k=0; k < ppn; k++) {
if (NULL != opal_pointer_array_get_item(&jmap->pmap, vpid)) {
/* this proc was already entered via some earlier step */
vpid += step;
continue;
}
pmap = OBJ_NEW(orte_pmap_t); pmap = OBJ_NEW(orte_pmap_t);
pmap->node = nid->index; pmap->node = nid->index;
pmap->local_rank = k; pmap->local_rank = k;
@ -986,12 +1244,35 @@ int orte_regex_decode_maps(char *regexp)
jmap->num_procs++; jmap->num_procs++;
opal_pointer_array_set_item(&jmap->pmap, vpid, pmap); opal_pointer_array_set_item(&jmap->pmap, vpid, pmap);
vpid += step; vpid += step;
/* increment #procs in the job */
num_procs++;
} }
/* increment #nodes in the job */
num_nodes++;
} }
} }
opal_argv_free(names); opal_argv_free(names);
names = NULL; names = NULL;
} }
/* if we are using static ports, load the hash tables */
if (orte_static_ports) {
if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) {
ORTE_ERROR_LOG(rc);
}
}
OBJ_DESTRUCT(&buf);
opal_argv_free(seqs); opal_argv_free(seqs);
if (NULL != jdat) {
/* record the regexp so it can be sent to the local procs */
jdat->regexp = strdup(regexp);
/* save the job data */
jdat->num_procs += num_procs;
jdat->num_nodes += num_nodes;
}
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

Просмотреть файл

@ -25,6 +25,7 @@
#include "orte_config.h" #include "orte_config.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
BEGIN_C_DECLS BEGIN_C_DECLS
@ -35,7 +36,7 @@ ORTE_DECLSPEC int orte_regex_extract_ppn(int num_nodes, char *regexp, int **ppn)
ORTE_DECLSPEC char* orte_regex_encode_maps(orte_job_t *jdata); ORTE_DECLSPEC char* orte_regex_encode_maps(orte_job_t *jdata);
ORTE_DECLSPEC int orte_regex_decode_maps(char *regexp); ORTE_DECLSPEC int orte_regex_decode_maps(char *regexp, orte_odls_job_t **jobdat);
END_C_DECLS END_C_DECLS
#endif #endif