
The problem was tracked to use of the grpcomm.onesided_barrier to control daemon/mpirun termination. This relied on messaging -and- required that the program counter jump from the errmgr back to grpcomm. On rare occasions, this jump did not occur, causing mpirun to hang.

This patch looks more invasive than it is - most of the affected files simply had one or two lines removed. The essence of the change is:

* pulled the job_complete and quit routines out of orterun and orted_main and put them in a common place
* modified the errmgr to directly call the new routines when termination is detected
* removed the grpcomm.onesided_barrier and its associated RML tag
* added a new "num_routes" API to the routed framework that reports back the number of dependent routes. When route_lost is called, the daemon's list of "children" is checked and adjusted if that route went to a "leaf" in the routing tree
* used connection termination between daemons to track rollup of the daemon tree. Daemons and HNP now terminate once num_routes returns zero

Also picked up in this commit is the addition of a new bool flag to the app_context struct, and increasing the job_control field from 8 to 16 bits. Both trivial.

This commit was SVN r23429.
78 строки
1.9 KiB
C
78 строки
1.9 KiB
C
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef ORTERUN_ORTERUN_H
#define ORTERUN_ORTERUN_H

#include "orte_config.h"
/* Needed for opal_mutex_t, which is embedded in struct orterun_globals_t. */
#include "opal/threads/mutex.h"

BEGIN_C_DECLS

/**
 * Main body of orterun functionality.
 *
 * @param argc  Argument count (presumably forwarded from main() -- confirm
 *              against the caller).
 * @param argv  Argument vector (presumably forwarded from main() -- confirm
 *              against the caller).
 *
 * @return An int status; the meaning of specific values is not visible from
 *         this header.
 */
int orterun(int argc, char *argv[]);

/**
|
|
* Global struct for catching orterun command line options.
|
|
*/
|
|
struct orterun_globals_t {
|
|
bool help;
|
|
bool version;
|
|
bool verbose;
|
|
char *report_pid;
|
|
char *report_uri;
|
|
bool exit;
|
|
bool by_node;
|
|
bool by_slot;
|
|
bool by_board;
|
|
bool by_socket;
|
|
bool bind_to_none;
|
|
bool bind_to_core;
|
|
bool bind_to_board;
|
|
bool bind_to_socket;
|
|
bool debugger;
|
|
int num_procs;
|
|
char *env_val;
|
|
char *appfile;
|
|
char *wdir;
|
|
char *path;
|
|
bool preload_binary;
|
|
char *preload_files;
|
|
char *preload_files_dest_dir;
|
|
opal_mutex_t lock;
|
|
bool sleep;
|
|
char *ompi_server;
|
|
bool wait_for_server;
|
|
int server_wait_timeout;
|
|
char *stdin_target;
|
|
bool disable_recovery;
|
|
};
|
|
|
|
/**
|
|
* Struct holding values gleaned from the orterun command line -
|
|
* needed by debugger init
|
|
*/
|
|
ORTE_DECLSPEC extern struct orterun_globals_t orterun_globals;
|
|
|
|
END_C_DECLS
|
|
|
|
#endif /* ORTERUN_ORTERUN_H */
|