/* * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #include "orte_config.h" #include "orte/constants.h" #include "opal/util/output.h" #ifdef HAVE_STRING_H #include #endif #include #ifdef HAVE_STDLIB_H #include #endif /* HAVE_STDLIB_H */ #ifdef HAVE_STRINGS_H #include #endif /* HAVE_STRINGS_H */ #ifdef HAVE_UNISTD_H #include #endif /* HAVE_UNISTD_H */ #include #include "opal/util/argv.h" #include "opal/util/os_path.h" #include "opal/util/path.h" #include "opal/util/opal_environ.h" #include "orte/util/name_fns.h" #include "orte/util/show_help.h" #include "orte/mca/rml/rml.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/debugger/base/base.h" #define DUMP_INT(X) fprintf(stderr, " %s = %d\n", # X, X); #if !ORTE_DISABLE_FULL_SUPPORT void orte_debugger_base_dump(void) { int i; DUMP_INT(MPIR_being_debugged); DUMP_INT(MPIR_debug_state); DUMP_INT(MPIR_partial_attach_ok); DUMP_INT(MPIR_i_am_starter); DUMP_INT(MPIR_forward_output); DUMP_INT(MPIR_proctable_size); fprintf(stderr, " MPIR_proctable:\n"); for (i = 0; i < MPIR_proctable_size; i++) { fprintf(stderr, " (i, host, exe, pid) = (%d, %s, %s, %d)\n", i, MPIR_proctable[i].host_name, MPIR_proctable[i].executable_name, MPIR_proctable[i].pid); } fprintf(stderr, "MPIR_executable_path: %s\n", ('\0' == MPIR_executable_path[0]) ? "NULL" : (char*) MPIR_executable_path); fprintf(stderr, "MPIR_server_arguments: %s\n", ('\0' == MPIR_server_arguments[0]) ? "NULL" : (char*) MPIR_server_arguments); } /* * Initialization of data structures for running under a debugger * using the MPICH/TotalView parallel debugger interface. This stage * of initialization must occur after spawn * * NOTE: We -always- perform this step to ensure that any debugger * that attaches to us post-launch of the application can get a * completed proctable */ void orte_debugger_base_init_after_spawn(orte_job_t *jdata) { orte_proc_t *proc; orte_app_context_t *appctx; orte_vpid_t i, j; opal_buffer_t buf; orte_process_name_t rank0; int rc; if (MPIR_proctable) { /* already initialized */ opal_output_verbose(5, orte_debugger_base.output, "%s: debugger already initialized", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); return; } /* fill in the proc table for the application processes */ opal_output_verbose(5, orte_debugger_base.output, "%s: Setting up debugger process table for applications", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); MPIR_debug_state = 1; /* set the total number of processes in the job */ MPIR_proctable_size = jdata->num_procs; /* allocate MPIR_proctable */ MPIR_proctable = (struct MPIR_PROCDESC *) malloc(sizeof(struct MPIR_PROCDESC) * MPIR_proctable_size); if (MPIR_proctable == NULL) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return; } if (orte_debugger_base.dump_proctable) { opal_output(orte_clean_output, "MPIR Proctable for job %s", ORTE_JOBID_PRINT(jdata->jobid)); } /* initialize MPIR_proctable */ for (j=0; j < jdata->num_procs; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { continue; } /* store this data in the location whose index * corresponds to the proc's rank */ i = proc->name.vpid; if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) { continue; } MPIR_proctable[i].host_name = strdup(proc->node->name); if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) { MPIR_proctable[i].executable_name = opal_os_path( false, appctx->app, NULL ); } else { MPIR_proctable[i].executable_name = opal_os_path( false, appctx->cwd, appctx->app, NULL ); } MPIR_proctable[i].pid = proc->pid; if (orte_debugger_base.dump_proctable) { opal_output(orte_clean_output, "%s: Host %s Exe %s Pid %d", ORTE_VPID_PRINT(i), MPIR_proctable[i].host_name, MPIR_proctable[i].executable_name, MPIR_proctable[i].pid); } } if (0 < opal_output_get_verbosity(orte_debugger_base.output)) { orte_debugger_base_dump(); } /* if we are being launched under a debugger, then we must wait * for it to be ready to go and do some things to start the job */ if (MPIR_being_debugged) { /* wait for all procs to have reported their contact info - this * ensures that (a) they are all into mpi_init, and (b) the system * has the contact info to successfully send a message to rank=0 */ ORTE_PROGRESSED_WAIT(false, jdata->num_reported, jdata->num_procs); (void) MPIR_Breakpoint(); /* send a message to rank=0 to release it */ OBJ_CONSTRUCT(&buf, opal_buffer_t); /* don't need anything in this */ rank0.jobid = jdata->jobid; rank0.vpid = 0; if (0 > (rc = orte_rml.send_buffer(&rank0, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0))) { opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc)); } OBJ_DESTRUCT(&buf); } } #endif /* * Breakpoint function for parallel debuggers */ void *MPIR_Breakpoint(void) { return NULL; }