7bee71aa59
Add a new function to opal_progress that tells us our recursion depth to support that solution. Yes, I know this sounds picky, but good ol' Jeff managed to make it happen by driving his cluster near to death... Also ensure that we declare "failed" for the daemon job when daemons fail instead of the application job. This is important so that orte knows that it cannot use xcast to tell daemons to "exit", nor should it expect all daemons to respond. Otherwise, it is possible to hang. After lots of testing, decided to default (again) to slurm detecting failed orteds. This proved necessary to avoid rather annoying hangs that were difficult to recover from. There are conditions where slurm will fail to launch all daemons (slurm folks are working on it), and yet again, good ol' Jeff managed to find both of them. Thank you, Jeff! :-/ This commit was SVN r18611.
49 строки
1.4 KiB
C
49 строки
1.4 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#ifndef ORTE_PLM_SLURM_EXPORT_H
|
|
#define ORTE_PLM_SLURM_EXPORT_H
|
|
|
|
#include "orte_config.h"
|
|
|
|
#include "opal/mca/mca.h"
|
|
#include "orte/mca/plm/plm.h"
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
struct orte_plm_slurm_component_t {
|
|
orte_plm_base_component_t super;
|
|
int priority;
|
|
char *orted;
|
|
char *custom_args;
|
|
};
|
|
typedef struct orte_plm_slurm_component_t orte_plm_slurm_component_t;
|
|
|
|
/*
|
|
* Globally exported variables
|
|
*/
|
|
|
|
ORTE_MODULE_DECLSPEC extern orte_plm_slurm_component_t
|
|
mca_plm_slurm_component;
|
|
ORTE_DECLSPEC extern orte_plm_base_module_t
|
|
orte_plm_slurm_module;
|
|
|
|
END_C_DECLS
|
|
|
|
#endif /* ORTE_PLM_SLURM_EXPORT_H */
|