8f45fcb429
* Fix the checkpoint-restart-checkpoint case, which would previously reject the checkpoint of the newly restarted process. Making sure to re-enable checkpointing once the application has fully restarted fixes this issue (make sure to set is_app_checkpointable to true on restart confirmation). * In the case of an invalid checkpoint, do not try to access the SStore datastore, as it will be using a dummy handler and will return NULL strings. mpirun was segfaulting in the error case because it was trying to convert the seq_num from a string to an integer. * Make sure to initialize the timer event in the Automatic Recovery section of the HNP errmgr, per the libevent update. This caused a segfault when attempting to recover a failed process. * If ompi-checkpoint loses its connection to the HNP/mpirun, the TCP socket will fail and call the ErrMgr update_state function. This commit adds a dummy function {{{orte_errmgr_base_update_state()}}} that will prevent the ompi-checkpoint command from segfaulting in this error scenario. This commit was SVN r24306.
76 строки
2.2 KiB
C
76 строки
2.2 KiB
C
/*
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation. All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * Copyright (c) 2004-2010 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
|
|
/** @file:
|
|
*/
|
|
|
|
#ifndef ORTE_MCA_ERRMGR_PRIVATE_H
|
|
#define ORTE_MCA_ERRMGR_PRIVATE_H
|
|
|
|
/*
|
|
* includes
|
|
*/
|
|
#include "orte_config.h"
|
|
#include "orte/constants.h"
|
|
#include "orte/types.h"
|
|
|
|
#include "opal/dss/dss_types.h"
|
|
#include "orte/mca/plm/plm_types.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
/*
|
|
* Functions for use solely within the ERRMGR framework
|
|
*/
|
|
BEGIN_C_DECLS
|
|
|
|
/*
 * Framework-global values for the ERRMGR base, gathered into one struct.
 */
typedef struct {
    /* NOTE(review): presumably the output stream handle used for framework
     * diagnostics -- confirm against the code that opens it. */
    int output;
    /* NOTE(review): presumably true once the framework has been set up --
     * confirm where this flag is written. */
    bool initialized;
} orte_errmgr_base_t;
|
|
|
|
/* Declaration of the framework-wide orte_errmgr_base_t instance; this header
 * only declares it (extern), the definition lives in another translation
 * unit. */
ORTE_DECLSPEC extern orte_errmgr_base_t orte_errmgr_base;
|
|
|
|
/*
 * ERRMGR command flag.
 *
 * The flag is a single unsigned byte. ORTE_ERRMGR_CMD names the matching
 * 8-bit OPAL type constant (NOTE(review): presumably the DSS type used when
 * packing/unpacking the flag -- confirm at the call sites). The typedef and
 * the macro must be changed together so their widths stay in agreement.
 */
typedef uint8_t orte_errmgr_cmd_flag_t;
#define ORTE_ERRMGR_CMD OPAL_UINT8
|
|
|
|
/*
 * ERRMGR command codes.
 *
 * Each value fits in one byte, the width of orte_errmgr_cmd_flag_t.
 * Kept as macros so they remain usable in preprocessor conditionals.
 */
#define ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD 0x01
#define ORTE_ERRMGR_REGISTER_CALLBACK_CMD   0x02
|
|
|
|
/*
|
|
* Base functions
|
|
*/
|
|
/**
 * Base implementation of the ERRMGR log entry point.
 *
 * @param error_code  error code being reported
 * @param filename    file name associated with the report
 *                    (NOTE(review): presumably the source file where the
 *                    error was detected -- confirm against callers)
 * @param line        line number associated with the report
 */
ORTE_DECLSPEC void orte_errmgr_base_log(int error_code,
                                        char *filename,
                                        int line);
|
|
|
|
ORTE_DECLSPEC void orte_errmgr_base_abort(int error_code, char *fmt, ...)
|
|
__opal_attribute_format__(__printf__, 2, 3)
|
|
__opal_attribute_noreturn__;
|
|
|
|
ORTE_DECLSPEC int orte_errmgr_base_update_state(orte_jobid_t job,
|
|
orte_job_state_t jobstate,
|
|
orte_process_name_t *proc_name,
|
|
orte_proc_state_t state,
|
|
pid_t pid,
|
|
orte_exit_code_t exit_code);
|
|
|
|
END_C_DECLS
|
|
#endif
|