From 063e4c9989b230b8e01274ef1d38373e53098a76 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 10 Feb 2015 08:27:13 -0800 Subject: [PATCH] Cleanup the pretty-print of odls cmds as some were missing. Add a new cmd to terminate the DVM, which the HNP will use to turn around and issue an xcast to the DVM. --- orte/mca/odls/odls_types.h | 3 +- orte/orted/orted_comm.c | 68 +++++++++++++++++++++++----- orte/tools/orte-dvm/orte-dvm.c | 13 +++++- orte/tools/orte-submit/orte-submit.c | 2 +- 4 files changed, 72 insertions(+), 14 deletions(-) diff --git a/orte/mca/odls/odls_types.h b/orte/mca/odls/odls_types.h index 2bfa427d4e..b20a4d686e 100644 --- a/orte/mca/odls/odls_types.h +++ b/orte/mca/odls/odls_types.h @@ -58,7 +58,8 @@ typedef uint8_t orte_daemon_cmd_flag_t; #define ORTE_DAEMON_SPAWN_JOB_CMD (orte_daemon_cmd_flag_t) 17 #define ORTE_DAEMON_TERMINATE_JOB_CMD (orte_daemon_cmd_flag_t) 18 #define ORTE_DAEMON_HALT_VM_CMD (orte_daemon_cmd_flag_t) 19 - +#define ORTE_DAEMON_HALT_DVM_CMD (orte_daemon_cmd_flag_t) 20 + /* request proc resource usage */ #define ORTE_DAEMON_TOP_CMD (orte_daemon_cmd_flag_t) 22 diff --git a/orte/orted/orted_comm.c b/orte/orted/orted_comm.c index a8a2985ed4..aa7356d56d 100644 --- a/orte/orted/orted_comm.c +++ b/orte/orted/orted_comm.c @@ -114,7 +114,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, orte_proc_t *cur_proc = NULL, *prev_proc = NULL; bool found = false; orte_node_t *node; - + orte_grpcomm_signature_t *sig; + /* unpack the command */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) { @@ -459,6 +460,7 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, return; break; + /**** HALT VM COMMAND ****/ case ORTE_DAEMON_HALT_VM_CMD: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received halt_vm cmd", @@ -491,6 +493,27 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, return; break; + /**** HALT DVM COMMAND ****/ + case 
ORTE_DAEMON_HALT_DVM_CMD: + if (orte_debug_daemons_flag) { + opal_output(0, "%s orted_cmd: received halt_dvm cmd", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + } + /* we just need to xcast the HALT_VM cmd out, which will send + * it back into us */ + answer = OBJ_NEW(opal_buffer_t); + command = ORTE_DAEMON_HALT_VM_CMD; + opal_dss.pack(answer, &command, 1, ORTE_DAEMON_CMD); + sig = OBJ_NEW(orte_grpcomm_signature_t); + sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t)); + sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid; + sig->signature[0].vpid = ORTE_VPID_WILDCARD; + orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, answer); + OBJ_RELEASE(answer); + OBJ_RELEASE(sig); + return; + break; + /**** SPAWN JOB COMMAND ****/ case ORTE_DAEMON_SPAWN_JOB_CMD: if (orte_debug_daemons_flag) { @@ -1127,8 +1150,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, static char *get_orted_comm_cmd_str(int command) { switch(command) { - case ORTE_DAEMON_NULL_CMD: - return strdup("NULL"); + case ORTE_DAEMON_CONTACT_QUERY_CMD: + return strdup("ORTE_DAEMON_CONTACT_QUERY_CMD"); case ORTE_DAEMON_KILL_LOCAL_PROCS: return strdup("ORTE_DAEMON_KILL_LOCAL_PROCS"); case ORTE_DAEMON_SIGNAL_LOCAL_PROCS: @@ -1137,26 +1160,49 @@ static char *get_orted_comm_cmd_str(int command) return strdup("ORTE_DAEMON_ADD_LOCAL_PROCS"); case ORTE_DAEMON_TREE_SPAWN: return strdup("ORTE_DAEMON_TREE_SPAWN"); + + case ORTE_DAEMON_HEARTBEAT_CMD: + return strdup("ORTE_DAEMON_HEARTBEAT_CMD"); + case ORTE_DAEMON_EXIT_CMD: + return strdup("ORTE_DAEMON_EXIT_CMD"); + case ORTE_DAEMON_PROCESS_AND_RELAY_CMD: + return strdup("ORTE_DAEMON_PROCESS_AND_RELAY_CMD"); case ORTE_DAEMON_MESSAGE_LOCAL_PROCS: return strdup("ORTE_DAEMON_MESSAGE_LOCAL_PROCS"); - case ORTE_DAEMON_EXIT_CMD: - return strdup("ORTE_DAEMON_EXIT_CMD"); - case ORTE_DAEMON_SPAWN_JOB_CMD: - return strdup("ORTE_DAEMON_SPAWN_JOB_CMD"); - case ORTE_DAEMON_CONTACT_QUERY_CMD: - return strdup("ORTE_DAEMON_CONTACT_QUERY_CMD"); + case 
ORTE_DAEMON_NULL_CMD: + return strdup("NULL"); + case ORTE_DAEMON_REPORT_JOB_INFO_CMD: return strdup("ORTE_DAEMON_REPORT_JOB_INFO_CMD"); case ORTE_DAEMON_REPORT_NODE_INFO_CMD: return strdup("ORTE_DAEMON_REPORT_NODE_INFO_CMD"); case ORTE_DAEMON_REPORT_PROC_INFO_CMD: return strdup("ORTE_DAEMON_REPORT_PROC_INFO_CMD"); - case ORTE_DAEMON_HEARTBEAT_CMD: - return strdup("ORTE_DAEMON_HEARTBEAT_CMD"); + case ORTE_DAEMON_SPAWN_JOB_CMD: + return strdup("ORTE_DAEMON_SPAWN_JOB_CMD"); + case ORTE_DAEMON_TERMINATE_JOB_CMD: + return strdup("ORTE_DAEMON_TERMINATE_JOB_CMD"); + + case ORTE_DAEMON_HALT_VM_CMD: + return strdup("ORTE_DAEMON_HALT_VM_CMD"); + case ORTE_DAEMON_HALT_DVM_CMD: + return strdup("ORTE_DAEMON_HALT_DVM_CMD"); case ORTE_DAEMON_TOP_CMD: return strdup("ORTE_DAEMON_TOP_CMD"); + case ORTE_DAEMON_NAME_REQ_CMD: + return strdup("ORTE_DAEMON_NAME_REQ_CMD"); + case ORTE_DAEMON_CHECKIN_CMD: + return strdup("ORTE_DAEMON_CHECKIN_CMD"); + + case ORTE_TOOL_CHECKIN_CMD: + return strdup("ORTE_TOOL_CHECKIN_CMD"); + case ORTE_DAEMON_PROCESS_CMD: + return strdup("ORTE_DAEMON_PROCESS_CMD"); case ORTE_DAEMON_ABORT_PROCS_CALLED: return strdup("ORTE_DAEMON_ABORT_PROCS_CALLED"); + case ORTE_DAEMON_NEW_COLL_ID: + return strdup("ORTE_DAEMON_NEW_COLL_ID"); + default: return strdup("Unknown Command!"); } diff --git a/orte/tools/orte-dvm/orte-dvm.c b/orte/tools/orte-dvm/orte-dvm.c index 5b5e859857..993a2d1e8f 100644 --- a/orte/tools/orte-dvm/orte-dvm.c +++ b/orte/tools/orte-dvm/orte-dvm.c @@ -138,7 +138,7 @@ int main(int argc, char *argv[]) char *param, *value; orte_job_t *jdata=NULL; orte_app_context_t *app; - char *uri; + char *uri, *ptr; /* Setup and parse the command line */ memset(&myglobals, 0, sizeof(myglobals)); @@ -250,6 +250,17 @@ int main(int argc, char *argv[]) } else if (0 == strcmp(myglobals.report_uri, "+")) { /* if '+', output to stderr */ fprintf(stderr, "VMURI: %s\n", uri); + } else if (0 == strncasecmp(myglobals.report_uri, "file:", strlen("file:"))) { + ptr = 
strchr(myglobals.report_uri, ':'); + ++ptr; + fp = fopen(ptr, "w"); + if (NULL == fp) { + orte_show_help("help-orterun.txt", "orterun:write_file", false, + myglobals.basename, "pid", ptr); + exit(0); + } + fprintf(fp, "%s\n", uri); + fclose(fp); } else { fp = fopen(myglobals.report_uri, "w"); if (NULL == fp) { diff --git a/orte/tools/orte-submit/orte-submit.c b/orte/tools/orte-submit/orte-submit.c index a256afb195..47964df34d 100644 --- a/orte/tools/orte-submit/orte-submit.c +++ b/orte/tools/orte-submit/orte-submit.c @@ -487,7 +487,7 @@ int main(int argc, char *argv[]) /* if this is the terminate command, just send it */ if (myglobals.terminate) { opal_buffer_t *buf; - orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_HALT_VM_CMD; + orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_HALT_DVM_CMD; buf = OBJ_NEW(opal_buffer_t); opal_dss.pack(buf, &cmd, 1, ORTE_DAEMON_CMD); orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,