/* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2006 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco, Inc. All rights reserved. * Copyright (c) 2007 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #include "orte_config.h" #include "orte/constants.h" #include #include #ifdef HAVE_UNISTD_H #include #endif #ifdef HAVE_NETDB_H #include #endif #ifdef HAVE_SYS_PARAM_H #include #endif #include #include #include #include "opal/class/opal_pointer_array.h" #include "opal/event/event.h" #include "opal/mca/base/base.h" #include "opal/threads/mutex.h" #include "opal/threads/condition.h" #include "opal/util/bit_ops.h" #include "opal/util/cmd_line.h" #include "opal/util/daemon_init.h" #include "opal/util/opal_environ.h" #include "opal/util/os_path.h" #include "opal/util/printf.h" #include "opal/util/trace.h" #include "opal/util/argv.h" #include "opal/runtime/opal.h" #include "opal/runtime/opal_progress.h" #include "opal/mca/base/mca_base_param.h" #include "opal/dss/dss.h" #include "orte/util/show_help.h" #include "orte/util/proc_info.h" #include "orte/util/session_dir.h" #include "orte/util/name_fns.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/odls/odls.h" #include "orte/mca/odls/base/base.h" #include "orte/mca/plm/plm.h" #include "orte/mca/plm/base/plm_private.h" #include "orte/mca/routed/routed.h" #include "orte/mca/ess/ess.h" #include "orte/mca/odls/base/odls_private.h" #include "orte/runtime/runtime.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_wait.h" #include "orte/orted/orted.h" /* * Globals */ static int process_commands(orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag); /* local callback function for non-blocking sends */ static void send_callback(int status, orte_process_name_t *peer, opal_buffer_t *buf, orte_rml_tag_t tag, void *cbdata) { /* nothing to do here - just release buffer and return */ OBJ_RELEASE(buf); } static void send_relay(opal_buffer_t *buf, orte_jobid_t target_job, orte_rml_tag_t tag) { opal_buffer_t *buffer = NULL; opal_list_t recips; opal_list_item_t *item; orte_namelist_t *nm; int ret; OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s orte:daemon:send_relay", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* get the list of next recipients from the routed module */ OBJ_CONSTRUCT(&recips, opal_list_t); /* ignore returned parent vpid - we don't care here */ orte_routed.get_routing_tree(target_job, &recips); /* if list is empty, nothing for us to do */ if (opal_list_is_empty(&recips)) { OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s orte:daemon:send_relay - recipient list is empty!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto CLEANUP; } /* get the first recipient so we can look at it */ item = opal_list_get_first(&recips); nm = (orte_namelist_t*)item; /* check to see if this message is going directly to the * target jobid and that jobid is not my own */ if (nm->name.jobid == target_job && target_job != ORTE_PROC_MY_NAME->jobid) { orte_daemon_cmd_flag_t command; orte_jobid_t job; orte_rml_tag_t msg_tag; int32_t n; OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s orte:daemon:send_relay sending directly to job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(nm->name.jobid))); /* this is going directly to the job, and not being * relayed any further. We need to remove the process-and-relay * command and the target jobid/tag from the buffer so the * recipients can correctly process it */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buf, &command, &n, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buf, &job, &n, ORTE_JOBID))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buf, &msg_tag, &n, ORTE_RML_TAG))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } /* if this isn't going to the daemon tag, then we have more to extract */ if (ORTE_RML_TAG_DAEMON != tag) { /* remove the message_local_procs cmd data */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buf, &command, &n, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buf, &job, &n, ORTE_JOBID))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buf, &msg_tag, &n, ORTE_RML_TAG))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } } buffer = OBJ_NEW(opal_buffer_t); opal_dss.copy_payload(buffer, buf); } else { /* buffer is already setup - just point to it */ buffer = buf; /* tag needs to be set to daemon_tag */ tag = ORTE_RML_TAG_DAEMON; } /* if the list has only one entry, and that entry has a wildcard * vpid, then we will handle it separately */ if (1 == opal_list_get_size(&recips) && nm->name.vpid == ORTE_VPID_WILDCARD) { /* okay, this is a wildcard case. First, look up the #procs in the * specified job - only the HNP can do this. Fortunately, the routed * modules are smart enough not to ask a remote daemon to do it! * However, just to be safe, in case some foolish future developer * doesn't get that logic right... ;-) */ orte_job_t *jdata; orte_vpid_t i; orte_process_name_t target; if (!orte_process_info.hnp) { ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED); goto CLEANUP; } if (NULL == (jdata = orte_get_job_data_object(nm->name.jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ret = ORTE_ERR_NOT_FOUND; goto CLEANUP; } /* send the buffer to all members of the specified job */ target.jobid = nm->name.jobid; for (i=0; i < jdata->num_procs; i++) { if (target.jobid == ORTE_PROC_MY_NAME->jobid && i == ORTE_PROC_MY_NAME->vpid) { /* do not send to myself! */ continue; } target.vpid = i; OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s orte:daemon:send_relay sending relay msg to %s tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&target), tag)); if (0 > (ret = orte_rml.send_buffer(&target, buffer, tag, 0))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } } } else { /* send the message to each recipient on list, deconstructing it as we go */ while (NULL != (item = opal_list_remove_first(&recips))) { nm = (orte_namelist_t*)item; OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s orte:daemon:send_relay sending relay msg to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name))); if (0 > (ret = orte_rml.send_buffer(&nm->name, buffer, tag, 0))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } } } CLEANUP: /* cleanup */ OBJ_DESTRUCT(&recips); if (NULL != buffer && buffer != buf) { OBJ_RELEASE(buffer); } } void orte_daemon_recv(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void* cbdata) { OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s orted_recv_cmd: received message from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender))); /* don't process this right away - we need to get out of the recv before * we process the message as it may ask us to do something that involves * more messaging! Instead, setup an event so that the message gets processed * as soon as we leave the recv. * * The macro makes a copy of the buffer, which we release when processed - the incoming * buffer, however, is NOT released here, although its payload IS transferred * to the message buffer for later processing */ ORTE_MESSAGE_EVENT(sender, buffer, tag, orte_daemon_cmd_processor); OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s orted_recv_cmd: reissued recv", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } static int num_recursions=0; static int wait_time=1; #define MAX_RECURSIONS 24 void orte_daemon_cmd_processor(int fd, short event, void *data) { orte_message_event_t *mev = (orte_message_event_t*)data; orte_process_name_t *sender = &(mev->sender); opal_buffer_t *buffer = mev->buffer; orte_rml_tag_t tag = mev->tag, target_tag; orte_jobid_t job; int ret; char *unpack_ptr, *save; orte_std_cntr_t n; orte_daemon_cmd_flag_t command; /* check to see if we are in a progress recursion */ if (orte_process_info.daemon && 1 < (ret = opal_progress_recursion_depth())) { /* if we are in a recursion, we want to repost the message event * so the progress engine can work its way back up to the top * of the stack. Given that this could happen multiple times, * we have to be careful to increase the time we wait so that * we provide enough time - but not more time than necessary - for * the stack to clear */ OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s orte:daemon:cmd:processor in recursion depth %d\n\treposting %s for tag %ld", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ret, ORTE_NAME_PRINT(sender), (long)(tag))); if (MAX_RECURSIONS < num_recursions) { /* we need to abort if we get too far down this path */ opal_output(0, "%s ORTED_CMD_PROCESSOR: STUCK IN INFINITE LOOP - ABORTING", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); OBJ_RELEASE(mev); /* make sure our local procs are dead - but don't update their state * on the HNP as this may be redundant */ orte_odls.kill_local_procs(ORTE_JOBID_WILDCARD, false); /* do -not- call finalize as this will send a message to the HNP * indicating clean termination! Instead, just forcibly cleanup * the local session_dir tree and abort */ orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); abort(); } wait_time = wait_time * 2; ++num_recursions; ORTE_MESSAGE_EVENT_DELAY(wait_time, mev); return; } wait_time = 1; num_recursions = 0; OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s orte:daemon:cmd:processor called by %s for tag %ld", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender), (long)(tag))); /* save the original buffer pointers */ unpack_ptr = buffer->unpack_ptr; /* unpack the initial command */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(ret); #if OMPI_ENABLE_DEBUG opal_output(0, "%s got message buffer from file %s line %d\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), mev->file, mev->line); #endif goto CLEANUP; } /* see if this is a "process-and-relay" command - i.e., an xcast is underway */ if (ORTE_DAEMON_PROCESS_AND_RELAY_CMD == command) { /* get the target jobid and tag */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &target_tag, &n, ORTE_RML_TAG))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } /* save this buffer location */ save = buffer->unpack_ptr; /* unpack the command that will actually be executed */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } /* is this an add-procs cmd? */ if (ORTE_DAEMON_ADD_LOCAL_PROCS == command) { /* yes - then it contains daemon update info - process it */ if (ORTE_SUCCESS != (ret = orte_odls_base_default_update_daemon_info(buffer))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } /* flag this location */ save = buffer->unpack_ptr; } /* rewind the buffer to the beginning */ buffer->unpack_ptr = unpack_ptr; /* do the relay */ send_relay(buffer, job, target_tag); /* rewind the buffer to the right place for processing the cmd */ buffer->unpack_ptr = save; } else { /* rewind the buffer so we can process it correctly */ buffer->unpack_ptr = unpack_ptr; } /* process the command */ if (ORTE_SUCCESS != (ret = process_commands(sender, buffer, tag))) { OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s orte:daemon:cmd:processor failed on error %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(ret))); } OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s orte:daemon:cmd:processor: processing commands completed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); CLEANUP: OBJ_RELEASE(mev); /* reissue the non-blocking receive */ ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL); if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { ORTE_ERROR_LOG(ret); } return; } static int process_commands(orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag) { orte_daemon_cmd_flag_t command; opal_buffer_t *relay_msg; int ret; orte_std_cntr_t n; int32_t signal; orte_jobid_t job; orte_rml_tag_t target_tag; char *contact_info; opal_buffer_t *answer; orte_rml_cmd_flag_t rml_cmd; orte_job_t *jdata; orte_process_name_t proc, proc2; int32_t status; orte_process_name_t *return_addr; int32_t num_replies; bool hnp_accounted_for; /* unpack the command */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(ret); return ret; } /* now process the command locally */ switch(command) { /**** NULL ****/ case ORTE_DAEMON_NULL_CMD: ret = ORTE_SUCCESS; break; /**** KILL_LOCAL_PROCS ****/ case ORTE_DAEMON_KILL_LOCAL_PROCS: /* unpack the jobid */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } if (ORTE_SUCCESS != (ret = orte_odls.kill_local_procs(job, true))) { ORTE_ERROR_LOG(ret); } break; /**** SIGNAL_LOCAL_PROCS ****/ case ORTE_DAEMON_SIGNAL_LOCAL_PROCS: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received signal_local_procs", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } /* unpack the jobid */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } /* get the signal */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &signal, &n, OPAL_INT32))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } /* signal them */ if (ORTE_SUCCESS != (ret = orte_odls.signal_local_procs(NULL, signal))) { ORTE_ERROR_LOG(ret); } break; /**** ADD_LOCAL_PROCS ****/ case ORTE_DAEMON_ADD_LOCAL_PROCS: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received add_local_procs", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } /* launch the processes */ if (ORTE_SUCCESS != (ret = orte_odls.launch_local_procs(buffer))) { OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s orted:comm:add_procs failed to launch on error %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(ret))); } break; /**** TREE_SPAWN ****/ case ORTE_DAEMON_TREE_SPAWN: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received tree_spawn", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } /* if the PLM supports remote spawn, pass it all along */ if (NULL != orte_plm.remote_spawn) { if (ORTE_SUCCESS != (ret = orte_plm.remote_spawn(buffer))) { ORTE_ERROR_LOG(ret); } } else { opal_output(0, "%s remote spawn is NULL!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } break; /**** DELIVER A MESSAGE TO THE LOCAL PROCS ****/ case ORTE_DAEMON_MESSAGE_LOCAL_PROCS: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received message_local_procs", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } /* unpack the jobid of the procs that are to receive the message */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } /* unpack the tag where we are to deliver the message */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &target_tag, &n, ORTE_RML_TAG))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s orted:comm:message_local_procs delivering message to job %s tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), (int)target_tag)); relay_msg = OBJ_NEW(opal_buffer_t); opal_dss.copy_payload(relay_msg, buffer); /* if job=my_jobid, then this message is for us and not for our children */ if (ORTE_PROC_MY_NAME->jobid == job) { /* if the target tag is our xcast_barrier or rml_update, then we have * to handle the message as a special case. The RML has logic in it * intended to make it easier to use. This special logic mandates that * any message we "send" actually only goes into the queue for later * transmission. Thus, since we are already in a recv when we enter * the "process_commands" function, any attempt to "send" the relay * buffer to ourselves will only be added to the queue - it won't * actually be delivered until *after* we conclude the processing * of the current recv. * * The problem here is that, for messages where we need to relay * them along the orted chain, the rml_update * message contains contact info we may well need in order to do * the relay! So we need to process those messages immediately. * The only way to accomplish that is to (a) detect that the * buffer is intended for those tags, and then (b) process * those buffers here. * */ if (ORTE_RML_TAG_RML_INFO_UPDATE == target_tag) { n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(relay_msg, &rml_cmd, &n, ORTE_RML_CMD))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } /* initialize the routes to my peers - this will update the number * of daemons in the system (i.e., orte_process_info.num_procs) as * this might have changed */ if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, relay_msg))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } } else { /* just deliver it to ourselves */ if ((ret = orte_rml.send_buffer(ORTE_PROC_MY_NAME, relay_msg, target_tag, 0)) < 0) { ORTE_ERROR_LOG(ret); } else { ret = ORTE_SUCCESS; opal_progress(); /* give us a chance to move the message along */ } } } else { /* must be for our children - deliver the message */ if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(job, relay_msg, target_tag))) { ORTE_ERROR_LOG(ret); } } OBJ_RELEASE(relay_msg); break; /**** COLLECTIVE DATA COMMAND ****/ case ORTE_DAEMON_COLL_CMD: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received collective data cmd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } if (ORTE_SUCCESS != (ret = orte_odls.collect_data(sender, buffer))) { ORTE_ERROR_LOG(ret); } break; /**** WAITPID_FIRED COMMAND ****/ case ORTE_DAEMON_WAITPID_FIRED: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received waitpid_fired cmd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } /* unpack the name of the proc that terminated */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc, &n, ORTE_NAME))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } /* unpack the termination status */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &status, &n, OPAL_INT32))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } /* pass it down for processing */ orte_base_default_waitpid_fired(&proc, status); break; /**** IOF_COMPLETE COMMAND ****/ case ORTE_DAEMON_IOF_COMPLETE: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received iof_complete cmd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } /* unpack the name of the proc that completed */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc, &n, ORTE_NAME))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } /* pass it down for processing */ orte_odls_base_notify_iof_complete(&proc); break; /**** EXIT COMMAND ****/ case ORTE_DAEMON_EXIT_WITH_REPLY_CMD: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received exit", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } /* disable routing - we need to do this * because daemons exit in an uncoordinated fashion. * Thus, our routes are being dismantled, so we can't * trust that any given route still exists */ orte_routing_is_enabled = false; /* if we are the HNP, kill our local procs and * flag we are exited - but don't yet exit */ if (orte_process_info.hnp) { orte_job_t *daemons; orte_proc_t **procs; /* if we are the HNP, ensure our local procs are terminated */ orte_odls.kill_local_procs(ORTE_JOBID_WILDCARD, false); /* now lookup the daemon job object */ if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } procs = (orte_proc_t**)daemons->procs->addr; /* declare us terminated so things can exit cleanly */ procs[0]->state = ORTE_PROC_STATE_TERMINATED; daemons->num_terminated++; /* need to check for job complete as otherwise this doesn't * get triggered in single-daemon systems */ orte_plm_base_check_job_completed(daemons); /* all done! */ return ORTE_SUCCESS; } /* if we are not the HNP, send a message to the HNP telling * it we are leaving - and then trigger our exit */ { opal_buffer_t ack; orte_proc_state_t state=ORTE_PROC_STATE_TERMINATED; orte_exit_code_t exit_code=0; orte_plm_cmd_flag_t cmd = ORTE_PLM_UPDATE_PROC_STATE; OBJ_CONSTRUCT(&ack, opal_buffer_t); opal_dss.pack(&ack, &cmd, 1, ORTE_PLM_CMD); opal_dss.pack(&ack, &(ORTE_PROC_MY_NAME->jobid), 1, ORTE_JOBID); opal_dss.pack(&ack, &(ORTE_PROC_MY_NAME->vpid), 1, ORTE_VPID); opal_dss.pack(&ack, &state, 1, ORTE_PROC_STATE); opal_dss.pack(&ack, &exit_code, 1, ORTE_EXIT_CODE); orte_rml.send_buffer(ORTE_PROC_MY_HNP, &ack, ORTE_RML_TAG_PLM, 0); OBJ_DESTRUCT(&ack); } orte_trigger_event(&orte_exit); return ORTE_SUCCESS; break; /**** EXIT_NO_REPLY COMMAND ****/ case ORTE_DAEMON_EXIT_NO_REPLY_CMD: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received exit_no_reply", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } /* disable routing - we need to do this * because daemons exit in an uncoordinated fashion. * Thus, our routes are being dismantled, so we can't * trust that any given route still exists */ orte_routing_is_enabled = false; /* if we are the HNP, kill our local procs and * flag we are exited - but don't yet exit */ if (orte_process_info.hnp) { orte_job_t *daemons; orte_proc_t **procs; /* if we are the HNP, ensure our local procs are terminated */ orte_odls.kill_local_procs(ORTE_JOBID_WILDCARD, false); /* now lookup the daemon job object */ if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } procs = (orte_proc_t**)daemons->procs->addr; /* declare us terminated so things can exit cleanly */ procs[0]->state = ORTE_PROC_STATE_TERMINATED; daemons->num_terminated++; /* There is nothing more to do here - actual exit will be * accomplished by the plm */ return ORTE_SUCCESS; } orte_trigger_event(&orte_exit); return ORTE_SUCCESS; break; /**** HALT VM COMMAND ****/ case ORTE_DAEMON_HALT_VM_CMD: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received halt vm", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } /* trigger our appropriate exit procedure * NOTE: this event will fire -after- any zero-time events * so any pending relays -do- get sent first */ orte_trigger_event(&orte_exit); return ORTE_SUCCESS; break; /**** SPAWN JOB COMMAND ****/ case ORTE_DAEMON_SPAWN_JOB_CMD: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received spawn job", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } answer = OBJ_NEW(opal_buffer_t); job = ORTE_JOBID_INVALID; /* can only process this if we are the HNP */ if (orte_process_info.hnp) { /* unpack the job data */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &jdata, &n, ORTE_JOB))) { ORTE_ERROR_LOG(ret); goto ANSWER_LAUNCH; } /* launch it */ if (ORTE_SUCCESS != (ret = orte_plm.spawn(jdata))) { ORTE_ERROR_LOG(ret); goto ANSWER_LAUNCH; } job = jdata->jobid; } ANSWER_LAUNCH: /* pack the jobid to be returned */ if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &job, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(answer); goto CLEANUP; } /* return response */ if (0 > orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0, send_callback, NULL)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ret = ORTE_ERR_COMM_FAILURE; } return ORTE_SUCCESS; break; /**** CONTACT QUERY COMMAND ****/ case ORTE_DAEMON_CONTACT_QUERY_CMD: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received contact query", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } /* send back contact info */ contact_info = orte_rml.get_contact_info(); if (NULL == contact_info) { ORTE_ERROR_LOG(ORTE_ERROR); ret = ORTE_ERROR; goto CLEANUP; } /* setup buffer with answer */ answer = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &contact_info, 1, OPAL_STRING))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(answer); goto CLEANUP; } if (0 > orte_rml.send_buffer_nb(sender, answer, tag, 0, send_callback, NULL)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ret = ORTE_ERR_COMM_FAILURE; } break; /**** REPORT_JOB_INFO_CMD COMMAND ****/ case ORTE_DAEMON_REPORT_JOB_INFO_CMD: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received job info query", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } /* if we are not the HNP, we can do nothing - report * back 0 procs so the tool won't hang */ if (!orte_process_info.hnp) { orte_std_cntr_t zero=0; answer = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &zero, 1, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(answer); goto CLEANUP; } /* callback function will release buffer */ if (0 > orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0, send_callback, NULL)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ret = ORTE_ERR_COMM_FAILURE; } } else { /* if we are the HNP, process the request */ orte_std_cntr_t i, num_jobs=0; orte_job_t **jobs=NULL, *jobdat; /* unpack the jobid */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } /* setup return */ answer = OBJ_NEW(opal_buffer_t); /* if they asked for a specific job, then just get that info */ if (ORTE_JOBID_WILDCARD != job) { if (NULL != (jobdat = orte_get_job_data_object(job))) { num_jobs = 1; jobs = &jobdat; } } else { /* count number of jobs */ for (i=0; i < orte_job_data->size; i++) { if (NULL == orte_job_data->addr[i]) break; num_jobs++; } jobs = (orte_job_t**)orte_job_data->addr; } /* pack the answer */ if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_jobs, 1, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(answer); goto CLEANUP; } if (0 < num_jobs) { if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, jobs, num_jobs, ORTE_JOB))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(answer); goto CLEANUP; } } /* callback function will release buffer */ if (0 > orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0, send_callback, NULL)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ret = ORTE_ERR_COMM_FAILURE; } } break; /**** REPORT_NODE_INFO_CMD COMMAND ****/ case ORTE_DAEMON_REPORT_NODE_INFO_CMD: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received node info query", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } /* if we are not the HNP, we can do nothing - report * back 0 nodes so the tool won't hang */ if (!orte_process_info.hnp) { orte_std_cntr_t zero=0; answer = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &zero, 1, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(answer); goto CLEANUP; } /* callback function will release buffer */ if (0 > orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0, send_callback, NULL)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ret = ORTE_ERR_COMM_FAILURE; } } else { /* if we are the HNP, process the request */ orte_std_cntr_t i, num_nodes=0; orte_node_t **nodes; char *nid; /* unpack the nodename */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &nid, &n, OPAL_STRING))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } /* setup return */ answer = OBJ_NEW(opal_buffer_t); /* if they asked for a specific node, then just get that info */ if (NULL != nid) { /* find this node */ nodes = (orte_node_t**)orte_node_pool->addr; for (i=0; i < orte_node_pool->size; i++) { if (NULL == nodes[i]) break; /* stop when we get past the end of data */ if (0 == strcmp(nid, nodes[i]->name)) { nodes = &nodes[i]; num_nodes = 1; break; } } } else { /* count number of nodes */ for (i=0; i < orte_node_pool->size; i++) { if (NULL == orte_node_pool->addr[i]) break; num_nodes++; } nodes = (orte_node_t**)orte_node_pool->addr; } /* pack the answer */ if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_nodes, 1, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(answer); goto CLEANUP; } if (0 < num_nodes) { if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, nodes, num_nodes, ORTE_NODE))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(answer); goto CLEANUP; } } /* callback function will release buffer */ if (0 > orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0, send_callback, NULL)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ret = ORTE_ERR_COMM_FAILURE; } } break; /**** REPORT_PROC_INFO_CMD COMMAND ****/ case ORTE_DAEMON_REPORT_PROC_INFO_CMD: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received proc info query", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } /* if we are not the HNP, we can do nothing - report * back 0 procs so the tool won't hang */ if (!orte_process_info.hnp) { orte_std_cntr_t zero=0; answer = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &zero, 1, ORTE_STD_CNTR))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(answer); goto CLEANUP; } /* callback function will release buffer */ if (0 > orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0, send_callback, NULL)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ret = ORTE_ERR_COMM_FAILURE; } } else { /* if we are the HNP, process the request */ orte_job_t *jdata; orte_proc_t **procs=NULL; orte_vpid_t num_procs=0, vpid; orte_std_cntr_t i; /* setup the answer */ answer = OBJ_NEW(opal_buffer_t); /* unpack the jobid */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } /* look up job data object */ if (NULL == (jdata = orte_get_job_data_object(job))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto PACK_ANSWER; } /* unpack the vpid */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &vpid, &n, ORTE_VPID))) { ORTE_ERROR_LOG(ret); goto PACK_ANSWER; } /* if they asked for a specific proc, then just get that info */ if (ORTE_VPID_WILDCARD != vpid) { /* find this proc */ procs = (orte_proc_t**)jdata->procs->addr; for (i=0; i < jdata->procs->size; i++) { if (NULL == procs[i]) break; /* stop when we get past the end of data */ if (vpid == procs[i]->name.vpid) { procs = &procs[i]; num_procs = 1; break; } } } else { procs = (orte_proc_t**)jdata->procs->addr; num_procs = jdata->num_procs; } PACK_ANSWER: /* pack number of procs */ if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_procs, 1, ORTE_VPID))) { ORTE_ERROR_LOG(ret); goto SEND_ANSWER; } if (0 < num_procs) { if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, procs, jdata->num_procs, ORTE_PROC))) { ORTE_ERROR_LOG(ret); goto SEND_ANSWER; } } SEND_ANSWER: /* callback function will release buffer */ if (0 > orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL, 0, send_callback, NULL)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ret = ORTE_ERR_COMM_FAILURE; } } break; /**** HEARTBEAT COMMAND ****/ case ORTE_DAEMON_HEARTBEAT_CMD: ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED); ret = ORTE_ERR_NOT_IMPLEMENTED; break; /**** SYNC FROM LOCAL PROC ****/ case ORTE_DAEMON_SYNC_BY_PROC: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_recv: received sync from local proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender)); } if (ORTE_SUCCESS != (ret = orte_odls.require_sync(sender, buffer, false))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } break; case ORTE_DAEMON_SYNC_WANT_NIDMAP: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_recv: received sync+nidmap from local proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender)); } if (ORTE_SUCCESS != (ret = orte_odls.require_sync(sender, buffer, true))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } break; /**** TOP COMMAND ****/ case ORTE_DAEMON_TOP_CMD: /* setup the answer */ answer = OBJ_NEW(opal_buffer_t); num_replies = 0; hnp_accounted_for = false; n = 1; while (ORTE_SUCCESS == opal_dss.unpack(buffer, &proc, &n, ORTE_NAME)) { /* the jobid provided will, of course, have the job family of * the requestor. We need to convert that to our own job family */ proc.jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, proc.jobid); if (orte_process_info.hnp) { return_addr = sender; /* if the request is for a wildcard vpid, then it goes to every * daemon. For scalability, we should probably xcast this some * day - but for now, we just loop */ if (ORTE_VPID_WILDCARD == proc.vpid) { /* loop across all daemons */ proc2.jobid = ORTE_PROC_MY_NAME->jobid; for (proc2.vpid=1; proc2.vpid < orte_process_info.num_procs; proc2.vpid++) { /* setup the cmd */ relay_msg = OBJ_NEW(opal_buffer_t); command = ORTE_DAEMON_TOP_CMD; if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &command, 1, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(relay_msg); goto SEND_TOP_ANSWER; } if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &proc, 1, ORTE_NAME))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(relay_msg); goto SEND_TOP_ANSWER; } if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, sender, 1, ORTE_NAME))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(relay_msg); goto SEND_TOP_ANSWER; } /* the callback function will release relay_msg buffer */ if (0 > orte_rml.send_buffer_nb(&proc2, relay_msg, ORTE_RML_TAG_DAEMON, 0, send_callback, NULL)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(relay_msg); ret = ORTE_ERR_COMM_FAILURE; } num_replies++; } /* account for our own reply */ if (!hnp_accounted_for) { hnp_accounted_for = true; num_replies++; } /* now get the data for my own procs */ goto GET_TOP; } else { /* this is for a single proc - see which daemon * this rank is on */ if (ORTE_VPID_INVALID == (proc2.vpid = orte_ess.proc_get_daemon(&proc))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto SEND_TOP_ANSWER; } /* if the vpid is me, then just handle this myself */ if (proc2.vpid == ORTE_PROC_MY_NAME->vpid) { if (!hnp_accounted_for) { hnp_accounted_for = true; num_replies++; } goto GET_TOP; } /* otherwise, forward the cmd on to the appropriate daemon */ relay_msg = OBJ_NEW(opal_buffer_t); command = ORTE_DAEMON_TOP_CMD; if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &command, 1, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(relay_msg); goto SEND_TOP_ANSWER; } proc2.jobid = ORTE_PROC_MY_NAME->jobid; if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &proc, 1, ORTE_NAME))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(relay_msg); goto SEND_TOP_ANSWER; } if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, sender, 1, ORTE_NAME))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(relay_msg); goto SEND_TOP_ANSWER; } /* the callback function will release relay_msg buffer */ if (0 > orte_rml.send_buffer_nb(&proc2, relay_msg, ORTE_RML_TAG_DAEMON, 0, send_callback, NULL)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(relay_msg); ret = ORTE_ERR_COMM_FAILURE; } } /* end if HNP */ } else { /* this came from the HNP, but needs to go back to the original * requestor. Unpack the name of that entity first */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc2, &n, ORTE_NAME))) { ORTE_ERROR_LOG(ret); goto SEND_TOP_ANSWER; } return_addr = &proc2; GET_TOP: /* this rank must be local to me, or the HNP wouldn't * have sent it to me - process the request */ if (ORTE_SUCCESS != (ret = orte_odls_base_get_proc_stats(answer, &proc))) { ORTE_ERROR_LOG(ret); goto SEND_TOP_ANSWER; } } } SEND_TOP_ANSWER: /* send the answer back to requester - callback * function will release buffer */ if (orte_process_info.hnp) { /* if I am the HNP, I need to also provide the number of * replies the caller should recv and the sample time */ time_t mytime; char *cptr; relay_msg = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &num_replies, 1, OPAL_INT32))) { ORTE_ERROR_LOG(ret); } time(&mytime); cptr = ctime(&mytime); cptr[strlen(cptr)-1] = '\0'; /* remove trailing newline */ if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &cptr, 1, OPAL_STRING))) { ORTE_ERROR_LOG(ret); } /* copy the stats payload */ opal_dss.copy_payload(relay_msg, answer); OBJ_RELEASE(answer); answer = relay_msg; } if (0 > orte_rml.send_buffer_nb(return_addr, answer, ORTE_RML_TAG_TOOL, 0, send_callback, NULL)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); ret = ORTE_ERR_COMM_FAILURE; } break; default: ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); ret = ORTE_ERR_BAD_PARAM; } CLEANUP: return ret; }