Continue development of regular expression support by implementing it for slurm launches. Works for both initial (cmd line and non-cmd line) and comm_spawn launch.
Additional work required to fully enable static port support when using cmd line regular expression launch system. This commit was SVN r21502.
Этот коммит содержится в:
родитель
bca8015b94
Коммит
0ba845fed2
@ -51,6 +51,7 @@
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/regex.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/notifier/base/base.h"
|
||||
|
||||
@ -184,18 +185,29 @@ int orte_ess_base_orted_setup(char **hosts)
|
||||
error = "orte_util_nidmap_init";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_util_setup_local_nidmap_entries())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_util_nidmap_init";
|
||||
goto error;
|
||||
}
|
||||
/* extract the node info from the environment and
|
||||
* build a nidmap from it
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap(hosts))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "construct daemon map from static ports";
|
||||
goto error;
|
||||
if (NULL != orted_launch_cmd) {
|
||||
/* the launch cmd was given via regexp on the cmd line - parse
|
||||
* it to get the contact info
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_regex_decode_maps(orted_launch_cmd, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_regex_decode_maps";
|
||||
goto error;
|
||||
}
|
||||
} else {
|
||||
if (ORTE_SUCCESS != (ret = orte_util_setup_local_nidmap_entries())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_util_nidmap_init";
|
||||
goto error;
|
||||
}
|
||||
/* extract the node info from the environment and
|
||||
* build a nidmap from it
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap(hosts))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "construct daemon map from static ports";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
/* be sure to update the routing tree so the initial "phone home"
|
||||
* to mpirun goes through the tree!
|
||||
|
@ -60,6 +60,7 @@
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/regex.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
@ -103,6 +104,85 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
/* get a pointer to the job map */
|
||||
map = jdata->map;
|
||||
|
||||
/* are we passing a regexp? */
|
||||
if (orte_use_regexp && jdata->num_apps < 2 && NULL == orte_debugger_daemon) {
|
||||
char *regexp;
|
||||
flag = 1;
|
||||
opal_dss.pack(data, &flag, 1, OPAL_INT8);
|
||||
regexp = orte_regex_encode_maps(jdata);
|
||||
opal_dss.pack(data, ®exp, 1, OPAL_STRING);
|
||||
free(regexp);
|
||||
/* if we are not using static ports, then we need to add the daemon wireup info */
|
||||
if (!orte_static_ports) {
|
||||
/* pack a flag indicating that wiring info is provided */
|
||||
flag = 1;
|
||||
opal_dss.pack(data, &flag, 1, OPAL_INT8);
|
||||
/* get wireup info for daemons per the selected routing module */
|
||||
wireup = OBJ_NEW(opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.get_wireup_info(wireup))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
return rc;
|
||||
}
|
||||
/* if anything was inserted, put it in a byte object for xmission */
|
||||
if (0 < wireup->bytes_used) {
|
||||
opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes);
|
||||
/* pack the number of bytes required by payload */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numbytes, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
return rc;
|
||||
}
|
||||
/* pack the byte object */
|
||||
bo.size = numbytes;
|
||||
boptr = &bo;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &boptr, 1, OPAL_BYTE_OBJECT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
return rc;
|
||||
}
|
||||
/* release the data since it has now been copied into our buffer */
|
||||
free(bo.bytes);
|
||||
} else {
|
||||
/* pack numbytes=0 so the unpack routine remains sync'd to us */
|
||||
numbytes = 0;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numbytes, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
OBJ_RELEASE(wireup);
|
||||
} else {
|
||||
/* pack a flag indicating no wireup info is provided */
|
||||
flag = 0;
|
||||
opal_dss.pack(data, &flag, 1, OPAL_INT8);
|
||||
}
|
||||
/* insert an "add-procs" command here so we can cleanly process it on the
|
||||
* other end
|
||||
*/
|
||||
command = ORTE_DAEMON_ADD_LOCAL_PROCS;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* since we will have processed this to update daemons, flag that we don't
|
||||
* have the regexp again
|
||||
*/
|
||||
flag = 2;
|
||||
opal_dss.pack(data, &flag, 1, OPAL_INT8);
|
||||
/* pack the jobid so it can be extracted later */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &job, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* all done */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* if we are not passing a regexp, then pass the nodemap */
|
||||
flag = 0;
|
||||
opal_dss.pack(data, &flag, 1, OPAL_INT8);
|
||||
/* construct a nodemap */
|
||||
if (ORTE_SUCCESS != (rc = orte_util_encode_nodemap(&bo))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -117,43 +197,53 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
}
|
||||
/* release the data since it has now been copied into our buffer */
|
||||
free(bo.bytes);
|
||||
|
||||
/* get wireup info for daemons per the selected routing module */
|
||||
wireup = OBJ_NEW(opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.get_wireup_info(wireup))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
|
||||
/* if we are not using static ports, we need to send the wireup info */
|
||||
if (!orte_static_ports) {
|
||||
/* pack a flag indicating wiring info is provided */
|
||||
flag = 1;
|
||||
opal_dss.pack(data, &flag, 1, OPAL_INT8);
|
||||
/* get wireup info for daemons per the selected routing module */
|
||||
wireup = OBJ_NEW(opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.get_wireup_info(wireup))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
return rc;
|
||||
}
|
||||
/* if anything was inserted, put it in a byte object for xmission */
|
||||
if (0 < wireup->bytes_used) {
|
||||
opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes);
|
||||
/* pack the number of bytes required by payload */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numbytes, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
return rc;
|
||||
}
|
||||
/* pack the byte object */
|
||||
bo.size = numbytes;
|
||||
boptr = &bo;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &boptr, 1, OPAL_BYTE_OBJECT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
return rc;
|
||||
}
|
||||
/* release the data since it has now been copied into our buffer */
|
||||
free(bo.bytes);
|
||||
} else {
|
||||
/* pack numbytes=0 so the unpack routine remains sync'd to us */
|
||||
numbytes = 0;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numbytes, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
OBJ_RELEASE(wireup);
|
||||
return rc;
|
||||
}
|
||||
/* if anything was inserted, put it in a byte object for xmission */
|
||||
if (0 < wireup->bytes_used) {
|
||||
opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes);
|
||||
/* pack the number of bytes required by payload */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numbytes, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
return rc;
|
||||
}
|
||||
/* pack the byte object */
|
||||
bo.size = numbytes;
|
||||
boptr = &bo;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &boptr, 1, OPAL_BYTE_OBJECT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
return rc;
|
||||
}
|
||||
/* release the data since it has now been copied into our buffer */
|
||||
free(bo.bytes);
|
||||
} else {
|
||||
/* pack numbytes=0 so the unpack routine remains sync'd to us */
|
||||
numbytes = 0;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(data, &numbytes, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(wireup);
|
||||
return rc;
|
||||
}
|
||||
/* pack a flag indicating no wireup data is provided */
|
||||
flag = 0;
|
||||
opal_dss.pack(data, &flag, 1, OPAL_INT8);
|
||||
}
|
||||
OBJ_RELEASE(wireup);
|
||||
|
||||
/* insert an "add-procs" command here so we can cleanly process it on the
|
||||
* other end
|
||||
@ -164,6 +254,13 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the flag indicating that we are not using regexps - required to
|
||||
* keep things in order when unpacking due to different ways the data
|
||||
* can get to the unpacking routine
|
||||
*/
|
||||
flag = 0;
|
||||
opal_dss.pack(data, &flag, 1, OPAL_INT8);
|
||||
|
||||
/* are we co-locating debugger daemons? */
|
||||
if (NULL != orte_debugger_daemon) {
|
||||
orte_app_context_t **apps;
|
||||
@ -306,6 +403,23 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int unpack_regexp(orte_odls_job_t **jobdat, opal_buffer_t *data)
|
||||
{
|
||||
char *regexp;
|
||||
int rc, cnt;
|
||||
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, ®exp, &cnt, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_regex_decode_maps(regexp, jobdat))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
free(regexp);
|
||||
return rc;
|
||||
}
|
||||
|
||||
int orte_odls_base_default_update_daemon_info(opal_buffer_t *data)
|
||||
{
|
||||
opal_buffer_t wireup;
|
||||
@ -313,8 +427,42 @@ int orte_odls_base_default_update_daemon_info(opal_buffer_t *data)
|
||||
int rc;
|
||||
orte_std_cntr_t cnt;
|
||||
int32_t numbytes;
|
||||
int8_t flag;
|
||||
|
||||
/* extract the byte object holding the daemonmap */
|
||||
/* unpack the flag for regexp */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* if we have a regexp, then process it so we know the daemonmap */
|
||||
if (0 < flag) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:update:daemon:info updating nidmap from regexp",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
if (ORTE_SUCCESS != (rc = unpack_regexp(NULL, data))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
/* update the routing tree */
|
||||
if (ORTE_SUCCESS != (rc = orte_routed.update_routing_tree())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* see if we have wiring info as well */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (0 < flag) {
|
||||
/* yes - extract and process it */
|
||||
goto wireup;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* otherwise, extract the byte object holding the daemonmap */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -345,6 +493,18 @@ int orte_odls_base_default_update_daemon_info(opal_buffer_t *data)
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* see if we have wiring info as well */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (0 == flag) {
|
||||
/* no - just return */
|
||||
return rc;
|
||||
}
|
||||
|
||||
wireup:
|
||||
/* unpack the #bytes of daemon wireup info in the message */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &numbytes, &cnt, OPAL_INT32))) {
|
||||
@ -396,14 +556,69 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
"%s odls:constructing child list",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* unpack the returned data to create the required structures
|
||||
* for a fork launch. Since the data will contain information
|
||||
* on procs for ALL nodes, we first have to find the value
|
||||
* struct that contains info for our node.
|
||||
*/
|
||||
/* unpack the flag for regexp */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
|
||||
/* set the default values since they may not be included in the data */
|
||||
*job = ORTE_JOBID_INVALID;
|
||||
if (0 < flag) {
|
||||
if (1 == flag) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls: constructing jobdat from regexp",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* need to setup the job from the regexp */
|
||||
if (ORTE_SUCCESS != (rc = unpack_regexp(&jobdat, data))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
/* record the jobid */
|
||||
*job = jobdat->jobid;
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls: using jobdat previously extracted from regexp",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* unpack the jobid */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, job, &cnt, ORTE_JOBID))) {
|
||||
*job = ORTE_JOBID_INVALID;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
/* find the corresponding jobdat */
|
||||
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||
item != opal_list_get_end(&orte_local_jobdata);
|
||||
item = opal_list_get_next(item)) {
|
||||
orte_odls_job_t *jdat = (orte_odls_job_t*)item;
|
||||
|
||||
/* is this the specified job? */
|
||||
if (jdat->jobid == *job) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:construct_child_list found existing jobdat for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(*job)));
|
||||
jobdat = jdat;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL == jobdat) {
|
||||
/* we have a problem */
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
}
|
||||
/* fake an app_idx array */
|
||||
app_idx = (int8_t*)malloc(jobdat->num_procs * sizeof(int8_t));
|
||||
memset(app_idx, 0, jobdat->num_procs * sizeof(int8_t));
|
||||
/* if we are doing a timing test, store the time the msg was recvd */
|
||||
if (orte_timing) {
|
||||
jobdat->launch_msg_recvd.tv_sec = orte_daemon_msg_recvd.tv_sec;
|
||||
jobdat->launch_msg_recvd.tv_usec = orte_daemon_msg_recvd.tv_usec;
|
||||
}
|
||||
goto find_my_procs;
|
||||
}
|
||||
|
||||
/* unpack the flag - are we co-locating debugger daemons? */
|
||||
cnt=1;
|
||||
@ -451,6 +666,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
/* unpack the jobid we are to launch */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, job, &cnt, ORTE_JOBID))) {
|
||||
*job = ORTE_JOBID_INVALID;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ERROR;
|
||||
}
|
||||
@ -475,6 +691,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:construct_child_list found existing jobdat for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(*job)));
|
||||
jobdat = jdat;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -487,13 +704,13 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
jobdat->jobid = *job;
|
||||
opal_list_append(&orte_local_jobdata, &jobdat->super);
|
||||
}
|
||||
|
||||
/* if we are doing a timing test, store the time the msg was recvd */
|
||||
if (orte_timing) {
|
||||
jobdat->launch_msg_recvd.tv_sec = orte_daemon_msg_recvd.tv_sec;
|
||||
jobdat->launch_msg_recvd.tv_usec = orte_daemon_msg_recvd.tv_usec;
|
||||
}
|
||||
|
||||
|
||||
/* UNPACK JOB-SPECIFIC DATA */
|
||||
/* unpack the number of nodes involved in this job */
|
||||
cnt=1;
|
||||
@ -590,6 +807,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *data,
|
||||
}
|
||||
}
|
||||
|
||||
find_my_procs:
|
||||
/* cycle through the procs and find mine */
|
||||
proc.jobid = jobdat->jobid;
|
||||
for (j=0; j < jobdat->num_procs; j++) {
|
||||
@ -1807,6 +2025,7 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
|
||||
orte_std_cntr_t cnt;
|
||||
int rc;
|
||||
bool found=false;
|
||||
int8_t flag;
|
||||
|
||||
/* protect operations involving the global list of children */
|
||||
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
|
||||
@ -1888,8 +2107,25 @@ int orte_odls_base_default_require_sync(orte_process_name_t *proc,
|
||||
/* the proc needs a copy of both the daemon/node map, and
|
||||
* the process map for its peers
|
||||
*/
|
||||
opal_dss.pack(&buffer, &orte_odls_globals.dmap, 1, OPAL_BYTE_OBJECT);
|
||||
opal_dss.pack(&buffer, &jobdat->pmap, 1, OPAL_BYTE_OBJECT);
|
||||
if (NULL != jobdat->regexp) {
|
||||
/* the data is in a regexp - send that */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:sync sending regexp %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
jobdat->regexp));
|
||||
flag = 1;
|
||||
opal_dss.pack(&buffer, &flag, 1, OPAL_INT8);
|
||||
opal_dss.pack(&buffer, &jobdat->regexp, 1, OPAL_STRING);
|
||||
} else {
|
||||
/* the data is in the local byte objects - send them */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:sync sending byte object",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
flag = 0;
|
||||
opal_dss.pack(&buffer, &flag, 1, OPAL_INT8);
|
||||
opal_dss.pack(&buffer, &orte_odls_globals.dmap, 1, OPAL_BYTE_OBJECT);
|
||||
opal_dss.pack(&buffer, &jobdat->pmap, 1, OPAL_BYTE_OBJECT);
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
|
@ -111,6 +111,7 @@ static void orte_odls_job_constructor(orte_odls_job_t *ptr)
|
||||
ptr->total_slots_alloc = 0;
|
||||
ptr->num_procs = 0;
|
||||
ptr->num_local_procs = 0;
|
||||
ptr->regexp = NULL;
|
||||
ptr->pmap = NULL;
|
||||
OBJ_CONSTRUCT(&ptr->collection_bucket, opal_buffer_t);
|
||||
OBJ_CONSTRUCT(&ptr->local_collection, opal_buffer_t);
|
||||
@ -132,6 +133,10 @@ static void orte_odls_job_destructor(orte_odls_job_t *ptr)
|
||||
}
|
||||
}
|
||||
|
||||
if (NULL != ptr->regexp) {
|
||||
free(ptr->regexp);
|
||||
}
|
||||
|
||||
if (NULL != ptr->pmap && NULL != ptr->pmap->bytes) {
|
||||
free(ptr->pmap->bytes);
|
||||
free(ptr->pmap);
|
||||
|
@ -110,6 +110,7 @@ typedef struct orte_odls_job_t {
|
||||
orte_std_cntr_t num_nodes; /* number of nodes involved in the job */
|
||||
orte_vpid_t num_procs;
|
||||
int32_t num_local_procs;
|
||||
char *regexp; /* the regular expression describing the job */
|
||||
opal_byte_object_t *pmap; /* local copy of pidmap byte object */
|
||||
opal_buffer_t collection_bucket;
|
||||
opal_buffer_t local_collection;
|
||||
|
@ -659,7 +659,14 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t
|
||||
* system pick any port
|
||||
*/
|
||||
opal_argv_append_nosize(&ports, "0");
|
||||
orte_static_ports = false;
|
||||
/* if static ports were specified, flag it
|
||||
* so the HNP does the right thing
|
||||
*/
|
||||
if (NULL != mca_oob_tcp_component.tcp4_static_ports) {
|
||||
orte_static_ports = true;
|
||||
} else {
|
||||
orte_static_ports = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -715,7 +722,14 @@ mca_oob_tcp_create_listen(int *target_sd, unsigned short *target_port, uint16_t
|
||||
* system pick any port
|
||||
*/
|
||||
opal_argv_append_nosize(&ports, "0");
|
||||
orte_static_ports = false;
|
||||
/* if static ports were specified, flag it
|
||||
* so the HNP does the right thing
|
||||
*/
|
||||
if (NULL != mca_oob_tcp_component.tcp6_static_ports) {
|
||||
orte_static_ports = true;
|
||||
} else {
|
||||
orte_static_ports = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* OPAL_WANT_IPV6 */
|
||||
|
@ -58,13 +58,13 @@
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/regex.h"
|
||||
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
|
||||
#include "orte/mca/plm/base/plm_private.h"
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
|
||||
static bool active_job_completed_callback = false;
|
||||
|
||||
static int orte_plm_base_report_launched(orte_jobid_t job);
|
||||
|
||||
static char *pretty_print_timing(int64_t secs, int64_t usecs);
|
||||
|
||||
int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
@ -109,6 +109,7 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
*/
|
||||
{
|
||||
char *crud;
|
||||
orte_odls_job_t *jobdat;
|
||||
crud = orte_regex_encode_maps(jdata);
|
||||
opal_output(0, "maps regex: %s", (NULL == crud) ? "NULL" : crud);
|
||||
if (NULL == crud) {
|
||||
@ -118,18 +119,30 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
orte_util_nidmap_init(NULL);
|
||||
orte_regex_decode_maps(crud);
|
||||
orte_regex_decode_maps(crud, &jobdat);
|
||||
free(crud);
|
||||
/* print-out the map */
|
||||
orte_nidmap_dump();
|
||||
orte_jobmap_dump();
|
||||
/* printout the jobdat */
|
||||
opal_output(orte_clean_output, "**** DUMP OF JOBDAT %s (%d nodes %d procs) ***",
|
||||
ORTE_JOBID_PRINT(jobdat->jobid), (int)jobdat->num_nodes, (int)(jobdat->num_procs));
|
||||
opal_output(orte_clean_output, "\tNum slots: %d\tControl: %x\tStdin: %d",
|
||||
(int)jobdat->total_slots_alloc, jobdat->controls, (int)jobdat->stdin_target);
|
||||
opal_output(orte_clean_output, "\tApp: %s", jobdat->apps[0]->app);
|
||||
opal_output(orte_clean_output, "\tCwd: %s", jobdat->apps[0]->cwd);
|
||||
crud = opal_argv_join(jobdat->apps[0]->argv, ',');
|
||||
opal_output(orte_clean_output, "\tArgv: %s", crud);
|
||||
free(crud);
|
||||
crud = opal_argv_join(jobdat->apps[0]->env, ',');
|
||||
opal_output(orte_clean_output, "\tEnv: %s", crud);
|
||||
free(crud);
|
||||
orte_never_launched = true;
|
||||
ORTE_UPDATE_EXIT_STATUS(0);
|
||||
orte_trigger_event(&orte_exit);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
|
||||
|
||||
{
|
||||
opal_byte_object_t bo;
|
||||
|
||||
@ -928,7 +941,7 @@ static void app_report_launch(int status, orte_process_name_t* sender,
|
||||
|
||||
}
|
||||
|
||||
static int orte_plm_base_report_launched(orte_jobid_t job)
|
||||
int orte_plm_base_report_launched(orte_jobid_t job)
|
||||
{
|
||||
int rc;
|
||||
orte_job_t *jdata;
|
||||
|
@ -94,6 +94,7 @@ ORTE_DECLSPEC int orte_plm_base_set_progress_sched(int sched);
|
||||
ORTE_DECLSPEC int orte_plm_base_setup_job(orte_job_t *jdata);
|
||||
ORTE_DECLSPEC int orte_plm_base_launch_apps(orte_jobid_t job);
|
||||
ORTE_DECLSPEC void orte_plm_base_launch_failed(orte_jobid_t job, pid_t pid, int status, orte_job_state_t state);
|
||||
ORTE_DECLSPEC int orte_plm_base_report_launched(orte_jobid_t job);
|
||||
|
||||
ORTE_DECLSPEC int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons);
|
||||
|
||||
|
@ -63,6 +63,7 @@
|
||||
#include "orte/types.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/regex.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
@ -159,6 +160,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
int proc_vpid_index;
|
||||
orte_jobid_t failed_job;
|
||||
bool failed_launch=true;
|
||||
bool using_regexp=false;
|
||||
|
||||
if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
|
||||
/* if this is a request to launch a local slave,
|
||||
@ -320,15 +322,6 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
argv[proc_vpid_index] = strdup(name_string);
|
||||
free(name_string);
|
||||
|
||||
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
|
||||
param = opal_argv_join(argv, ' ');
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurm: final top-level argv:\n\t%s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == param) ? "NULL" : param));
|
||||
if (NULL != param) free(param);
|
||||
}
|
||||
|
||||
/* Copy the prefix-directory specified in the
|
||||
corresponding app_context. If there are multiple,
|
||||
different prefix's in the app context, complain (i.e., only
|
||||
@ -369,6 +362,29 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
opal_setenv(var, "rsh", true, &env);
|
||||
free(var);
|
||||
|
||||
/* if we can do it, use the regexp to launch the apps - this
|
||||
* requires that the user requested this mode, that we were
|
||||
* provided with static ports, and that we only have one
|
||||
* app_context
|
||||
*/
|
||||
if (orte_use_regexp && orte_static_ports && jdata->num_apps < 2) {
|
||||
char *regexp;
|
||||
regexp = orte_regex_encode_maps(jdata);
|
||||
opal_argv_append(&argc, &argv, "--launch");
|
||||
opal_argv_append(&argc, &argv, regexp);
|
||||
free(regexp);
|
||||
using_regexp = true;
|
||||
}
|
||||
|
||||
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
|
||||
param = opal_argv_join(argv, ' ');
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurm: final top-level argv:\n\t%s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(NULL == param) ? "NULL" : param));
|
||||
if (NULL != param) free(param);
|
||||
}
|
||||
|
||||
/* exec the daemon(s) */
|
||||
if (ORTE_SUCCESS != (rc = plm_slurm_start_proc(argc, argv, env, cur_prefix))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -393,12 +409,25 @@ launch_apps:
|
||||
/* get here if daemons launch okay - any failures now by apps */
|
||||
launching_daemons = false;
|
||||
failed_job = active_job;
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurm: launch of apps failed for job %s on error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
|
||||
goto cleanup;
|
||||
if (using_regexp) {
|
||||
/* daemons already have launch cmd - just wait for them to
|
||||
* report back
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_report_launched(jdata->jobid))) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:slurm:launch failed for job %s on error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
|
||||
goto cleanup;
|
||||
}
|
||||
} else {
|
||||
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
|
||||
"%s plm:slurm: launch of apps failed for job %s on error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
/* declare the launch a success */
|
||||
|
@ -276,7 +276,7 @@ void orte_daemon_cmd_processor(int fd, short event, void *data)
|
||||
orte_daemon_msg_recvd.tv_sec = mesg_recvd.tv_sec;
|
||||
orte_daemon_msg_recvd.tv_usec = mesg_recvd.tv_usec;
|
||||
}
|
||||
/* cmd contains daemon update info - process it */
|
||||
/* the cmd contains daemon update info - process it */
|
||||
if (ORTE_SUCCESS != (ret = orte_odls_base_default_update_daemon_info(buffer))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto CLEANUP;
|
||||
|
@ -193,6 +193,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Create a new xterm window and display output from the specified ranks there" },
|
||||
|
||||
{ NULL, NULL, NULL, '\0', "launch", "launch", 1,
|
||||
&orted_launch_cmd, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"A regular expression describing the job to be launched at startup" },
|
||||
|
||||
/* End of list */
|
||||
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||
@ -684,6 +688,20 @@ int orte_daemon(int argc, char *argv[])
|
||||
if (0 < orted_globals.heartbeat) {
|
||||
ORTE_TIMER_EVENT(orted_globals.heartbeat, 0, orte_plm_base_heartbeat);
|
||||
}
|
||||
|
||||
/* if we were given a launch string, then process it */
|
||||
if (NULL != orted_launch_cmd) {
|
||||
opal_buffer_t launch;
|
||||
int8_t flag;
|
||||
orte_daemon_cmd_flag_t command = ORTE_DAEMON_ADD_LOCAL_PROCS;
|
||||
OBJ_CONSTRUCT(&launch, opal_buffer_t);
|
||||
opal_dss.pack(&launch, &command, 1, ORTE_DAEMON_CMD);
|
||||
flag = 1;
|
||||
opal_dss.pack(&launch, &flag, 1, OPAL_INT8);
|
||||
opal_dss.pack(&launch, &orted_launch_cmd, 1, OPAL_STRING);
|
||||
ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &launch, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor);
|
||||
OBJ_DESTRUCT(&launch);
|
||||
}
|
||||
|
||||
/* wait to hear we are done */
|
||||
opal_event_dispatch();
|
||||
|
@ -64,7 +64,6 @@ bool orte_hetero_apps = false;
|
||||
bool orte_never_launched = false;
|
||||
bool orte_devel_level_output = false;
|
||||
|
||||
int32_t orte_contiguous_nodes;
|
||||
char **orte_launch_environ;
|
||||
|
||||
bool orte_hnp_is_allocated = false;
|
||||
@ -105,6 +104,8 @@ bool orte_send_profile;
|
||||
/* Nidmap and job maps */
|
||||
opal_pointer_array_t orte_nidmap;
|
||||
opal_pointer_array_t orte_jobmap;
|
||||
bool orte_use_regexp;
|
||||
char *orted_launch_cmd = NULL;
|
||||
|
||||
/* list of local children on a daemon */
|
||||
opal_list_t orte_local_children;
|
||||
|
@ -456,7 +456,6 @@ ORTE_DECLSPEC extern bool orte_leave_session_attached;
|
||||
ORTE_DECLSPEC extern bool orte_do_not_launch;
|
||||
ORTE_DECLSPEC extern bool orted_spin_flag;
|
||||
ORTE_DECLSPEC extern bool orte_static_ports;
|
||||
ORTE_DECLSPEC extern int32_t orte_contiguous_nodes;
|
||||
ORTE_DECLSPEC extern bool orte_keep_fqdn_hostnames;
|
||||
ORTE_DECLSPEC extern bool orte_show_resolved_nodenames;
|
||||
ORTE_DECLSPEC extern int orted_debug_failure;
|
||||
@ -508,6 +507,8 @@ ORTE_DECLSPEC extern bool orte_send_profile;
|
||||
/* Nidmap and job maps */
|
||||
ORTE_DECLSPEC extern opal_pointer_array_t orte_nidmap;
|
||||
ORTE_DECLSPEC extern opal_pointer_array_t orte_jobmap;
|
||||
ORTE_DECLSPEC extern bool orte_use_regexp;
|
||||
ORTE_DECLSPEC extern char *orted_launch_cmd;
|
||||
|
||||
/* list of local children on a daemon */
|
||||
ORTE_DECLSPEC extern opal_list_t orte_local_children;
|
||||
|
@ -205,11 +205,12 @@ int orte_register_params(void)
|
||||
false, false, (int)false, &value);
|
||||
orte_keep_fqdn_hostnames = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/* whether or not contiguous nodenames are in use */
|
||||
mca_base_param_reg_int_name("orte", "contiguous_nodes",
|
||||
"Number of nodes after which contiguous nodename encoding will automatically be used [default: INT_MAX]",
|
||||
false, false, INT32_MAX, &orte_contiguous_nodes);
|
||||
|
||||
/* whether or not to use regular expressions for launch */
|
||||
mca_base_param_reg_int_name("orte", "use_regexp",
|
||||
"Whether or not to use regular expressions for launch [default: no]",
|
||||
false, false, (int)false, &value);
|
||||
orte_use_regexp = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/* whether to tag output */
|
||||
mca_base_param_reg_int_name("orte", "tag_output",
|
||||
"Tag all output with [job,rank] (default: false)",
|
||||
|
@ -353,6 +353,10 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Output a brief periodic report on launch progress" },
|
||||
|
||||
{ "orte", "use", "regexp", '\0', "use-regexp", "use-regexp", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Use regular expressions for launch" },
|
||||
|
||||
/* End of list */
|
||||
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||
|
@ -53,6 +53,7 @@
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/regex.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
|
||||
@ -65,6 +66,8 @@ int orte_util_nidmap_init(opal_buffer_t *buffer)
|
||||
int32_t cnt;
|
||||
int rc;
|
||||
opal_byte_object_t *bo;
|
||||
int8_t flag;
|
||||
char *regexp;
|
||||
|
||||
if (!initialized) {
|
||||
/* need to construct the global arrays */
|
||||
@ -85,6 +88,29 @@ int orte_util_nidmap_init(opal_buffer_t *buffer)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* extract the flag indicating the type of info in the buffer */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT8))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (0 < flag) {
|
||||
/* the data is a regular expression - extract and parse it
|
||||
* to get the daemonmap and process map
|
||||
*/
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, ®exp, &cnt, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_regex_decode_maps(regexp, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
free(regexp);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* extract the byte object holding the daemonmap */
|
||||
cnt=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) {
|
||||
@ -263,15 +289,11 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
|
||||
{
|
||||
orte_vpid_t *vpids;
|
||||
orte_node_t **nodes;
|
||||
char prefix[ORTE_MAX_NODE_PREFIX], *tmp;
|
||||
int32_t i, len, firstnode, lastnode, nodenum, num_nodes;
|
||||
uint8_t command = ORTE_CONTIG_NODE_CMD;
|
||||
int32_t i, num_nodes;
|
||||
uint8_t num_digs;
|
||||
uint8_t incdec;
|
||||
int rc;
|
||||
char *nodename;
|
||||
opal_buffer_t buf;
|
||||
int step;
|
||||
int32_t *arch;
|
||||
|
||||
/* setup a buffer for tmp use */
|
||||
@ -314,160 +336,25 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
|
||||
}
|
||||
}
|
||||
|
||||
/* see if the cluster is configured with contiguous
|
||||
* node names and we have more than the HNP
|
||||
*/
|
||||
if (orte_contiguous_nodes < num_nodes) {
|
||||
/* discover the prefix - find first non-alpha character */
|
||||
len = strlen(nodes[1]->name);
|
||||
memset(prefix, 0, ORTE_MAX_NODE_PREFIX);
|
||||
prefix[0] = nodes[1]->name[0]; /* must start with alpha */
|
||||
for (i=1; i < len; i++) {
|
||||
if (!isalpha(nodes[1]->name[i])) {
|
||||
/* found a non-alpha char */
|
||||
if (!isdigit(nodes[1]->name[i])) {
|
||||
/* if it is anything but a digit,
|
||||
* then that's not good
|
||||
*/
|
||||
opal_output(0, "%s encode:nidmap Nodename pattern is nonstandard",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
/* okay, this defines end of the prefix.
|
||||
* convert rest of name to an offset
|
||||
*/
|
||||
firstnode = strtol(&(nodes[1]->name[i]), NULL, 10);
|
||||
/* figure out how many digits are in the index */
|
||||
for (num_digs=0; isdigit(nodes[1]->name[i+num_digs]); num_digs++);
|
||||
goto PACK;
|
||||
/* pack every nodename individually */
|
||||
for (i=1; i < num_nodes; i++) {
|
||||
if (!orte_keep_fqdn_hostnames) {
|
||||
char *ptr;
|
||||
nodename = strdup(nodes[i]->name);
|
||||
if (NULL != (ptr = strchr(nodename, '.'))) {
|
||||
*ptr = '\0';
|
||||
}
|
||||
prefix[i] = nodes[1]->name[i];
|
||||
}
|
||||
|
||||
PACK:
|
||||
/* begin encoding rest of map by indicating that this will
|
||||
* be a contiguous node map
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, OPAL_UINT8))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* pack the prefix */
|
||||
tmp = &prefix[0];
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &tmp, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
len = strlen(prefix);
|
||||
|
||||
/* pack the number of digits in the index */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &num_digs, 1, OPAL_UINT8))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* and the starting offset */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &firstnode, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
||||
"%s encode:nidmap:contig_nodes prefix %s num_digits %d offset %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), prefix, num_digs, firstnode));
|
||||
|
||||
lastnode = strtol(&(nodes[2]->name[i]), NULL, 10);
|
||||
if ((lastnode - firstnode) < 0) {
|
||||
/* we are decrementing */
|
||||
incdec = 0;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &incdec, 1, OPAL_INT8))) {
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &nodename, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
free(nodename);
|
||||
} else {
|
||||
/* we are incrementing */
|
||||
incdec = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &incdec, 1, OPAL_INT8))) {
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &nodes[i]->name, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
lastnode = firstnode;
|
||||
/* cycle through the nodes - pack the starting offset
|
||||
* and total number of nodes in each contiguous range
|
||||
*/
|
||||
for (i=2; i < num_nodes; i++) {
|
||||
nodenum = strtol(&(nodes[i]->name[len]), NULL, 10);
|
||||
step = nodenum -lastnode;
|
||||
if (step < 0) {
|
||||
/* we are decrementing */
|
||||
step = lastnode - nodenum;
|
||||
}
|
||||
if (step > 1) {
|
||||
/* have a break - indicate end of range */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &lastnode, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* indicate start of new range */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &nodenum, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
||||
"%s encode:nidmap:contig_nodes end range %d start next range %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lastnode, nodenum));
|
||||
}
|
||||
lastnode = nodenum;
|
||||
}
|
||||
/* pack end of range */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &lastnode, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
||||
"%s encode:nidmap:contig_nodes end range %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), lastnode));
|
||||
/* pack flag end of ranges */
|
||||
lastnode = -1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &lastnode, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
} else {
|
||||
/* if the nodes aren't contiguous, then we need
|
||||
* to simply pack every nodename individually
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
||||
"%s encode:nidmap non_contig_nodes - packing all names",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* indicate that this will not be a contiguous node map */
|
||||
command = ORTE_NON_CONTIG_NODE_CMD;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, OPAL_UINT8))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
for (i=1; i < num_nodes; i++) {
|
||||
if (!orte_keep_fqdn_hostnames) {
|
||||
char *ptr;
|
||||
nodename = strdup(nodes[i]->name);
|
||||
if (NULL != (ptr = strchr(nodename, '.'))) {
|
||||
*ptr = '\0';
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &nodename, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
free(nodename);
|
||||
} else {
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &nodes[i]->name, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* since the daemon vpids may not correspond to the node
|
||||
@ -588,15 +475,12 @@ int orte_util_encode_nodemap(opal_byte_object_t *boptr)
|
||||
|
||||
int orte_util_decode_nodemap(opal_byte_object_t *bo)
|
||||
{
|
||||
int n, loc, k, diglen, namelen;
|
||||
char *prefix, digits[10];
|
||||
int32_t num_nodes, lastnode, endrange, i, num_daemons;
|
||||
int n;
|
||||
int32_t num_nodes, i, num_daemons;
|
||||
orte_nid_t *node;
|
||||
orte_vpid_t *vpids;
|
||||
uint8_t command, num_digs;
|
||||
uint8_t num_digs;
|
||||
orte_nid_t **nd, *ndptr;
|
||||
uint8_t incdec;
|
||||
int32_t index, step;
|
||||
int32_t *arch;
|
||||
opal_buffer_t buf;
|
||||
opal_byte_object_t *boptr;
|
||||
@ -655,123 +539,22 @@ int orte_util_decode_nodemap(opal_byte_object_t *bo)
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack flag to see if this is a contiguous node map or not */
|
||||
n=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &command, &n, OPAL_UINT8))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
if (ORTE_CONTIG_NODE_CMD == command) {
|
||||
/* unpack the prefix */
|
||||
n=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &prefix, &n, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* the number of digits in the index */
|
||||
n=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &num_digs, &n, OPAL_UINT8))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* and the starting offset */
|
||||
n=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &lastnode, &n, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack increment/decrement flag */
|
||||
n=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &incdec, &n, OPAL_INT8))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* unpack the end of the range */
|
||||
n=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &endrange, &n, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* setup loop params */
|
||||
if (0 == incdec) {
|
||||
endrange -= 1;
|
||||
step = -1;
|
||||
} else {
|
||||
endrange += 1;
|
||||
step = 1;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
||||
"%s decode:nidmap:contig_nodes prefix %s num_digits %d offset %d endrange %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), prefix, num_digs, lastnode, endrange));
|
||||
|
||||
namelen = strlen(prefix) + num_digs + 1;
|
||||
/* cycle through the ranges */
|
||||
index = 1;
|
||||
while (1) {
|
||||
for (i=lastnode; i != endrange; i += step) {
|
||||
node = OBJ_NEW(orte_nid_t);
|
||||
/* allocate space for the nodename */
|
||||
node->name = (char*)malloc(namelen);
|
||||
memset(node->name, 0, namelen);
|
||||
loc = snprintf(node->name, namelen, "%s", prefix);
|
||||
diglen = num_digs - snprintf(digits, 10, "%d", i);
|
||||
for (k=0; k < diglen && loc < namelen; k++) {
|
||||
node->name[loc] = '0';
|
||||
loc++;
|
||||
}
|
||||
strncat(node->name, digits, num_digs);
|
||||
/* the arch defaults to our arch so that non-hetero
|
||||
* case will yield correct behavior
|
||||
*/
|
||||
opal_pointer_array_set_item(&orte_nidmap, index, node);
|
||||
index++;
|
||||
}
|
||||
/* unpack start of new range */
|
||||
n=1;
|
||||
opal_dss.unpack(&buf, &lastnode, &n, OPAL_INT32);
|
||||
/* if that is -1, then it flags no more ranges */
|
||||
if (-1 == lastnode) {
|
||||
goto process_daemons;
|
||||
}
|
||||
n=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &endrange, &n, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (0 == incdec) {
|
||||
endrange -= 1;
|
||||
} else {
|
||||
endrange += 1;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* not contiguous - just loop over nodes and
|
||||
* unpack the raw nodename
|
||||
/* loop over nodes and unpack the raw nodename */
|
||||
for (i=1; i < num_nodes; i++) {
|
||||
node = OBJ_NEW(orte_nid_t);
|
||||
/* the arch defaults to our arch so that non-hetero
|
||||
* case will yield correct behavior
|
||||
*/
|
||||
for (i=1; i < num_nodes; i++) {
|
||||
node = OBJ_NEW(orte_nid_t);
|
||||
/* the arch defaults to our arch so that non-hetero
|
||||
* case will yield correct behavior
|
||||
*/
|
||||
opal_pointer_array_set_item(&orte_nidmap, i, node);
|
||||
|
||||
/* unpack the node's name */
|
||||
n=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &(node->name), &n, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
opal_pointer_array_set_item(&orte_nidmap, i, node);
|
||||
|
||||
/* unpack the node's name */
|
||||
n=1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &(node->name), &n, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
process_daemons:
|
||||
/* unpack the daemon names */
|
||||
vpids = (orte_vpid_t*)malloc(num_nodes * sizeof(orte_vpid_t));
|
||||
n=num_nodes;
|
||||
@ -1141,7 +924,7 @@ cleanup:
|
||||
orte_jmap_t* orte_util_lookup_jmap(orte_jobid_t job)
|
||||
{
|
||||
int i;
|
||||
orte_jmap_t **jmaps;
|
||||
orte_jmap_t *jmap;
|
||||
|
||||
/* unfortunately, job objects cannot be stored
|
||||
* by index number as the jobid is a constructed
|
||||
@ -1151,17 +934,16 @@ orte_jmap_t* orte_util_lookup_jmap(orte_jobid_t job)
|
||||
* left-justified as cleanup is done - and array
|
||||
* entries set to NULL - upon job completion.
|
||||
*/
|
||||
jmaps = (orte_jmap_t**)orte_jobmap.addr;
|
||||
for (i=0; i < orte_jobmap.size; i++) {
|
||||
if (NULL == jmaps[i]) {
|
||||
if (NULL == (jmap = (orte_jmap_t*)opal_pointer_array_get_item(&orte_jobmap, i))) {
|
||||
continue;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_debug_output,
|
||||
"%s lookup:pmap: checking job %s for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jmaps[i]->job), ORTE_JOBID_PRINT(job)));
|
||||
if (job == jmaps[i]->job) {
|
||||
return jmaps[i];
|
||||
ORTE_JOBID_PRINT(jmap->job), ORTE_JOBID_PRINT(job)));
|
||||
if (job == jmap->job) {
|
||||
return jmap;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1192,19 +974,18 @@ orte_pmap_t* orte_util_lookup_pmap(orte_process_name_t *proc)
|
||||
static orte_nid_t* find_daemon_node(orte_process_name_t *proc)
|
||||
{
|
||||
int32_t i;
|
||||
orte_nid_t **nids;
|
||||
orte_nid_t *nid;
|
||||
|
||||
nids = (orte_nid_t**)orte_nidmap.addr;
|
||||
for (i=0; i < orte_nidmap.size; i++) {
|
||||
if (NULL == nids[i]) {
|
||||
if (NULL == (nid = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) {
|
||||
continue;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_debug_output,
|
||||
"%s find:daemon:node: checking daemon %s for %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_VPID_PRINT(nids[i]->daemon), ORTE_VPID_PRINT(proc->vpid)));
|
||||
if (nids[i]->daemon == proc->vpid) {
|
||||
return nids[i];
|
||||
ORTE_VPID_PRINT(nid->daemon), ORTE_VPID_PRINT(proc->vpid)));
|
||||
if (nid->daemon == proc->vpid) {
|
||||
return nid;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -26,12 +26,30 @@
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYS_SOCKET_H
|
||||
#include <sys/socket.h>
|
||||
#endif
|
||||
#ifdef HAVE_NETINET_IN_H
|
||||
#include <netinet/in.h>
|
||||
#endif
|
||||
#ifdef HAVE_ARPA_INET_H
|
||||
#include <arpa/inet.h>
|
||||
#endif
|
||||
#ifdef HAVE_NETDB_H
|
||||
#include <netdb.h>
|
||||
#endif
|
||||
#ifdef HAVE_IFADDRS_H
|
||||
#include <ifaddrs.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "orte/util/regex.h"
|
||||
@ -445,19 +463,23 @@ char* orte_regex_encode_maps(orte_job_t *jdata)
|
||||
char prefix[ORTE_MAX_NODE_PREFIX];
|
||||
int startnum;
|
||||
opal_list_item_t *item;
|
||||
char **regexargs = NULL, *tmp;
|
||||
char **regexargs = NULL, *tmp, *tmp2;
|
||||
int32_t num_nodes, start, cnt, ppn, nppn;
|
||||
orte_vpid_t vpid_start, start_vpid, end_vpid, base;
|
||||
char *regexp = NULL;
|
||||
bool byslot;
|
||||
orte_node_rank_t node_rank, nrank;
|
||||
char suffix, sfx;
|
||||
orte_app_context_t *app;
|
||||
|
||||
/* this is only supported with regular maps - i.e., when
|
||||
* the mapping is byslot or bynode. Irregular maps cannot
|
||||
* be expressed in a regular expression
|
||||
*
|
||||
* Also only supported for one app_context
|
||||
*/
|
||||
if (jdata->map->policy & ORTE_RMAPS_BYUSER) {
|
||||
if (jdata->map->policy & ORTE_RMAPS_BYUSER ||
|
||||
jdata->num_apps > 1) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -598,6 +620,39 @@ char* orte_regex_encode_maps(orte_job_t *jdata)
|
||||
opal_argv_append_nosize(®exargs, tmp);
|
||||
free(tmp);
|
||||
|
||||
/* next comes the total slots allocated to us */
|
||||
asprintf(&tmp, "SLOTS=%d", (int)jdata->total_slots_alloc);
|
||||
opal_argv_append_nosize(®exargs, tmp);
|
||||
free(tmp);
|
||||
|
||||
/* the control flags for this job */
|
||||
asprintf(&tmp, "CTRLS=%d", (int)jdata->controls);
|
||||
opal_argv_append_nosize(®exargs, tmp);
|
||||
free(tmp);
|
||||
|
||||
/* the stdin target for the job */
|
||||
asprintf(&tmp, "STDIN=%d", (int)jdata->stdin_target);
|
||||
opal_argv_append_nosize(®exargs, tmp);
|
||||
free(tmp);
|
||||
|
||||
/* the app_context for the job - can only be one! Just include
|
||||
* the required portions
|
||||
*/
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
|
||||
asprintf(&tmp, "APP=\"%s:%s\"", app->app, app->cwd);
|
||||
opal_argv_append_nosize(®exargs, tmp);
|
||||
free(tmp);
|
||||
tmp2 = opal_argv_join(app->argv, '#');
|
||||
asprintf(&tmp, "ARGV=\"%s\"", (NULL == tmp2) ? "NULL" : tmp2);
|
||||
free(tmp2);
|
||||
opal_argv_append_nosize(®exargs, tmp);
|
||||
free(tmp);
|
||||
tmp2 = opal_argv_join(app->env, '#');
|
||||
asprintf(&tmp, "ENV=\"%s\"", (NULL == tmp2) ? "NULL" : tmp2);
|
||||
free(tmp2);
|
||||
opal_argv_append_nosize(®exargs, tmp);
|
||||
free(tmp);
|
||||
|
||||
/* next comes the starting daemon vpid */
|
||||
asprintf(&tmp, "DVPID=%s", ORTE_VPID_PRINT(jdata->map->daemon_vpid_start));
|
||||
opal_argv_append_nosize(®exargs, tmp);
|
||||
@ -855,10 +910,10 @@ cleanup:
|
||||
return rc;
|
||||
}
|
||||
|
||||
int orte_regex_decode_maps(char *regexp)
|
||||
int orte_regex_decode_maps(char *regexp, orte_odls_job_t **jobdat)
|
||||
{
|
||||
char **seqs, *ptr, **names;
|
||||
int i, j, k, n, rc;
|
||||
char **seqs, *ptr, **names, *ptr2, check[5];
|
||||
int i, j, k, n, entry, rc;
|
||||
int ppn, step, start_nrank, nrank;
|
||||
int32_t tmp32;
|
||||
orte_vpid_t daemon_vpid, vpid;
|
||||
@ -867,27 +922,46 @@ int orte_regex_decode_maps(char *regexp)
|
||||
orte_jmap_t *jmap;
|
||||
orte_pmap_t *pmap;
|
||||
bool found;
|
||||
|
||||
orte_odls_job_t *jdat;
|
||||
orte_app_context_t *app;
|
||||
opal_list_item_t *item;
|
||||
int num_procs, num_nodes;
|
||||
struct hostent *h;
|
||||
opal_buffer_t buf;
|
||||
char *uri, *addr;
|
||||
orte_process_name_t proc;
|
||||
char *proc_name;
|
||||
bool hnp_entry;
|
||||
|
||||
/* if regexp is NULL, then nothing to parse */
|
||||
if (NULL == regexp) {
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
|
||||
/* ensure the global nidmap/pidmap arrays are initialized */
|
||||
if (ORTE_SUCCESS != (rc = orte_util_nidmap_init(NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* break the regexp into its component parts - this is trivial
|
||||
* because they are all separated by commas!
|
||||
*/
|
||||
seqs = opal_argv_split(regexp, ',');
|
||||
|
||||
/* we need to have at least three elements or something is wrong */
|
||||
if (opal_argv_count(seqs) < 3) {
|
||||
/* we need to have at least six elements or something is wrong */
|
||||
if (opal_argv_count(seqs) < 6) {
|
||||
opal_argv_free(seqs);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* start parsing with the first entry */
|
||||
entry=0;
|
||||
|
||||
/* the first entry is the local jobid, so we extract that and
|
||||
* convert it into a global jobid
|
||||
*/
|
||||
ptr = strchr(seqs[0], '=');
|
||||
ptr = strchr(seqs[entry++], '=');
|
||||
if (NULL == ptr) {
|
||||
opal_argv_free(seqs);
|
||||
return ORTE_ERROR;
|
||||
@ -915,8 +989,146 @@ int orte_regex_decode_maps(char *regexp)
|
||||
opal_pointer_array_add(&orte_jobmap, jmap);
|
||||
}
|
||||
|
||||
/* the second entry is the starting daemon vpid for the job being launched */
|
||||
ptr = strchr(seqs[0], '=');
|
||||
jdat = NULL;
|
||||
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
|
||||
/* even though we are unpacking an add_local_procs cmd, we cannot assume
|
||||
* that no job record for this jobid exists. A race condition exists that
|
||||
* could allow another daemon's procs to call us with a collective prior
|
||||
* to our unpacking add_local_procs. So lookup the job record for this jobid
|
||||
* and see if it already exists
|
||||
*/
|
||||
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||
item != opal_list_get_end(&orte_local_jobdata);
|
||||
item = opal_list_get_next(item)) {
|
||||
orte_odls_job_t *jdt = (orte_odls_job_t*)item;
|
||||
|
||||
/* is this the specified job? */
|
||||
if (jdt->jobid == jobid) {
|
||||
jdat = jdt;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL == jdat) {
|
||||
/* setup jobdat object for this job */
|
||||
jdat = OBJ_NEW(orte_odls_job_t);
|
||||
jdat->jobid = jobid;
|
||||
opal_list_append(&orte_local_jobdata, &jdat->super);
|
||||
}
|
||||
if (NULL != jobdat) {
|
||||
*jobdat = jdat;
|
||||
}
|
||||
/* see if this was previously decoded */
|
||||
if (NULL != jdat->regexp) {
|
||||
/* yep - don't decode it again */
|
||||
opal_argv_free(seqs);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* next entry is the total slots allocated to this job */
|
||||
ptr = strchr(seqs[entry++], '=');
|
||||
if (NULL == ptr) {
|
||||
opal_argv_free(seqs);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
ptr++;
|
||||
jdat->total_slots_alloc = strtol(ptr, NULL, 10);
|
||||
|
||||
/* next entry is the control flags for the job */
|
||||
ptr = strchr(seqs[entry++], '=');
|
||||
if (NULL == ptr) {
|
||||
opal_argv_free(seqs);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
ptr++;
|
||||
jdat->controls = strtol(ptr, NULL, 10);
|
||||
|
||||
/* next entry - stdin target */
|
||||
ptr = strchr(seqs[entry++], '=');
|
||||
if (NULL == ptr) {
|
||||
opal_argv_free(seqs);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
ptr++;
|
||||
jdat->stdin_target = strtol(ptr, NULL, 10);
|
||||
|
||||
/* next entry - the app_context itself */
|
||||
ptr = strchr(seqs[entry++], '=');
|
||||
if (NULL == ptr) {
|
||||
opal_argv_free(seqs);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
ptr++;
|
||||
/* some shells will strip the starting and ending quotes, and some won't -
|
||||
* so check for them here
|
||||
*/
|
||||
if ('\"' == *ptr) ptr++;
|
||||
if ('\"' == ptr[strlen(ptr)-1]) ptr[strlen(ptr)-1] = '\0';
|
||||
/* create the app_context object */
|
||||
app = OBJ_NEW(orte_app_context_t);
|
||||
jdat->apps = (orte_app_context_t**)malloc(sizeof(orte_app_context_t*));
|
||||
jdat->apps[0] = app;
|
||||
jdat->num_apps = 1;
|
||||
/* get the app and the cwd by hand */
|
||||
ptr2 = strchr(ptr, ':');
|
||||
*ptr2 = '\0';
|
||||
app->app = strdup(ptr);
|
||||
ptr = ++ptr2;
|
||||
app->cwd = strdup(ptr);
|
||||
|
||||
/* the next entry is the argv for the app_context, separated by '#'. We
|
||||
* assume we can use argv_split for this purpose. First check, though, for
|
||||
* NULL, indicating there were no argvs
|
||||
*/
|
||||
ptr = strchr(seqs[entry++], '=');
|
||||
if (NULL == ptr) {
|
||||
opal_argv_free(seqs);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
ptr++;
|
||||
/* some shells will strip the starting and ending quotes, and some won't -
|
||||
* so check for them here
|
||||
*/
|
||||
if ('\"' == *ptr) ptr++;
|
||||
if ('\"' == ptr[strlen(ptr)-1]) ptr[strlen(ptr)-1] = '\0';
|
||||
for (i=0; i < 4; i++) {
|
||||
check[i] = ptr[i];
|
||||
}
|
||||
check[4] = '\0';
|
||||
if (0 != strcmp("NULL", check)) {
|
||||
/* there are argvs */
|
||||
app->argv = opal_argv_split(ptr, '#');
|
||||
|
||||
}
|
||||
|
||||
/* the next entry is the env for the app_context, also separated by '#'.
|
||||
* Again, start by checking for NULL
|
||||
*/
|
||||
ptr = strchr(seqs[entry++], '=');
|
||||
if (NULL == ptr) {
|
||||
opal_argv_free(seqs);
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
ptr++;
|
||||
/* some shells will strip the starting and ending quotes, and some won't -
|
||||
* so check for them here
|
||||
*/
|
||||
if ('\"' == *ptr) ptr++;
|
||||
if ('\"' == ptr[strlen(ptr)-1]) ptr[strlen(ptr)-1] = '\0';
|
||||
for (i=0; i < 4; i++) {
|
||||
check[i] = ptr[i];
|
||||
}
|
||||
check[4] = '\0';
|
||||
if (0 != strcmp("NULL", check)) {
|
||||
/* there are argvs */
|
||||
app->env = opal_argv_split(ptr, '#');
|
||||
}
|
||||
|
||||
} else {
|
||||
entry += 6;
|
||||
}
|
||||
|
||||
/* next entry is the starting daemon vpid for the job being launched */
|
||||
ptr = strchr(seqs[entry++], '=');
|
||||
if (NULL == ptr) {
|
||||
opal_argv_free(seqs);
|
||||
return ORTE_ERROR;
|
||||
@ -930,14 +1142,18 @@ int orte_regex_decode_maps(char *regexp)
|
||||
opal_argv_free(seqs);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/* the remaining entries contain the name of the nodes in the system, how
|
||||
* many procs (if any) on each of those nodes, the starting vpid of the
|
||||
* procs on those nodes, and the starting node rank for the procs on
|
||||
* each node
|
||||
*/
|
||||
names = NULL;
|
||||
for (n=2; n < opal_argv_count(seqs); n++) {
|
||||
num_procs = 0;
|
||||
num_nodes = 0;
|
||||
hnp_entry = true;
|
||||
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
||||
for (n=entry; n < opal_argv_count(seqs); n++) {
|
||||
/* parse the node entry to get a list of all node names in it */
|
||||
if (ORTE_SUCCESS != (rc = parse_node_range(seqs[n], &names, &vpid, &ppn, &step, &start_nrank))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -963,22 +1179,64 @@ int orte_regex_decode_maps(char *regexp)
|
||||
nid->name = strdup(names[i]);
|
||||
nid->index = opal_pointer_array_add(&orte_nidmap, nid);
|
||||
}
|
||||
/* are there any procs on this node? */
|
||||
if (ORTE_VPID_INVALID != vpid) {
|
||||
/* is this the hnp entry (very first one), or are there any procs on this node? */
|
||||
if (hnp_entry || ORTE_VPID_INVALID != vpid) {
|
||||
/* yep - add a daemon if we don't already one, otherwise
|
||||
* this is just adding procs to an existing daemon
|
||||
*/
|
||||
if (ORTE_VPID_INVALID != daemon_vpid &&
|
||||
ORTE_VPID_INVALID == nid->daemon) {
|
||||
/* no daemon assigned yet - add it */
|
||||
nid->daemon = daemon_vpid;
|
||||
daemon_vpid++;
|
||||
if (hnp_entry) {
|
||||
/* the hnp is always daemon=0 */
|
||||
nid->daemon = 0;
|
||||
hnp_entry = false; /* only do this once */
|
||||
} else {
|
||||
nid->daemon = daemon_vpid++;
|
||||
}
|
||||
/* if we are using static ports, create the contact info
|
||||
* for the daemon on this node
|
||||
*/
|
||||
if (orte_static_ports) {
|
||||
/* lookup the address of this node */
|
||||
if (NULL == (h = gethostbyname(nid->name))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
addr = inet_ntoa(*(struct in_addr*)h->h_addr_list[0]);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((0, orte_debug_output,
|
||||
"%s orte:regex: constructing static path to node %s daemon %d addr %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
nid->name, (int)nid->daemon, addr));
|
||||
|
||||
/* since we are using static ports, all my fellow daemons will be on my
|
||||
* port. Setup the contact info for each daemon in my hash tables. Note
|
||||
* that this will -not- open a port to those daemons, but will only
|
||||
* define the info necessary for opening such a port if/when I communicate
|
||||
* to them
|
||||
*/
|
||||
/* construct the URI */
|
||||
proc.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
proc.vpid = nid->daemon;
|
||||
orte_util_convert_process_name_to_string(&proc_name, &proc);
|
||||
asprintf(&uri, "%s;tcp://%s:%d", proc_name, addr, (int)orte_process_info.my_port);
|
||||
opal_dss.pack(&buf, &uri, 1, OPAL_STRING);
|
||||
free(proc_name);
|
||||
free(uri);
|
||||
}
|
||||
}
|
||||
|
||||
/* cycle through the ppn, adding a pmap
|
||||
* for each new rank
|
||||
*/
|
||||
nrank = start_nrank;
|
||||
for (k=0; k < ppn; k++) {
|
||||
if (NULL != opal_pointer_array_get_item(&jmap->pmap, vpid)) {
|
||||
/* this proc was already entered via some earlier step */
|
||||
vpid += step;
|
||||
continue;
|
||||
}
|
||||
pmap = OBJ_NEW(orte_pmap_t);
|
||||
pmap->node = nid->index;
|
||||
pmap->local_rank = k;
|
||||
@ -986,12 +1244,35 @@ int orte_regex_decode_maps(char *regexp)
|
||||
jmap->num_procs++;
|
||||
opal_pointer_array_set_item(&jmap->pmap, vpid, pmap);
|
||||
vpid += step;
|
||||
/* increment #procs in the job */
|
||||
num_procs++;
|
||||
}
|
||||
/* increment #nodes in the job */
|
||||
num_nodes++;
|
||||
}
|
||||
}
|
||||
opal_argv_free(names);
|
||||
names = NULL;
|
||||
}
|
||||
|
||||
/* if we are using static ports, load the hash tables */
|
||||
if (orte_static_ports) {
|
||||
if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&buf);
|
||||
|
||||
|
||||
opal_argv_free(seqs);
|
||||
|
||||
if (NULL != jdat) {
|
||||
/* record the regexp so it can be sent to the local procs */
|
||||
jdat->regexp = strdup(regexp);
|
||||
/* save the job data */
|
||||
jdat->num_procs += num_procs;
|
||||
jdat->num_nodes += num_nodes;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -25,6 +25,7 @@
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
@ -35,7 +36,7 @@ ORTE_DECLSPEC int orte_regex_extract_ppn(int num_nodes, char *regexp, int **ppn)
|
||||
|
||||
ORTE_DECLSPEC char* orte_regex_encode_maps(orte_job_t *jdata);
|
||||
|
||||
ORTE_DECLSPEC int orte_regex_decode_maps(char *regexp);
|
||||
ORTE_DECLSPEC int orte_regex_decode_maps(char *regexp, orte_odls_job_t **jobdat);
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user