1
1
r15390 - Changed the way the runtime works by allowing the mpirun process to
act as an orted and spawn processes itself. This broke checkpoint/restart (C/R)
for this special case, because C/R required that the orted start the processes
and that the process hierarchy remain intact.
The fix is to allow the global coordinator to also act as a local coordinator
in this case.

r15528 - Changed the selection logic for the RML. This caused the application to
segfault if the 'ftrm' wrapper component was selected, because it tried to
modify a NULL pointer.
The fix is to move the 'module swap' code into the component's init() function
and perform the swap when init() is passed a NULL pointer. It sounds bad, but it
actually cleans up the code a bit.

Still have to fix the 'routed' framework.

This commit was SVN r15566.

The following SVN revision numbers were found above:
  r15390 --> open-mpi/ompi@bd65f8ba88
  r15528 --> open-mpi/ompi@39a6057fc6
Этот коммит содержится в:
Josh Hursey 2007-07-23 20:13:37 +00:00
родитель 2df5576d1d
Коммит a24e530f8e
6 изменённых файлов: 225 добавлений и 81 удалений

Просмотреть файл

@ -38,6 +38,9 @@ extern "C" {
ORTE_MODULE_DECLSPEC extern orte_rml_component_t mca_rml_ftrm_component;
ORTE_MODULE_DECLSPEC extern orte_rml_module_t orte_rml_ftrm_module;
ORTE_MODULE_DECLSPEC extern orte_rml_component_t mca_rml_ftrm_wrapped_component;
ORTE_MODULE_DECLSPEC extern orte_rml_module_t orte_rml_ftrm_wrapped_module;
/*
* Init (Component)
*/

Просмотреть файл

@ -95,8 +95,30 @@ static int ftrm_priority = -1;
*/
/**
 * Component init entry point. Its behavior depends on whether 'priority'
 * is NULL.
 *
 * @param priority  Non-NULL: receives ftrm_priority, and the ftrm module is
 *                  returned.
 *                  NULL: the current orte_rml module and orte_rml_component
 *                  are saved as the "wrapped" versions and then replaced by
 *                  the ftrm module and component.
 *
 * @return &orte_rml_ftrm_module when asked for a priority, NULL after
 *         performing the swap.
 */
orte_rml_module_t* orte_rml_ftrm_component_init(int* priority)
{
    /*
     * Asked to return a priority
     */
    if( NULL != priority ) {
        *priority = ftrm_priority;
        return &orte_rml_ftrm_module;
    }

    /*
     * Called a second time to swap module pointers.
     * 'priority' is NULL on this path and must not be dereferenced.
     */

    /* Copy the wrapped versions */
    orte_rml_ftrm_wrapped_module   = orte_rml;
    mca_rml_ftrm_wrapped_component = *orte_rml_component;

    /* Replace with ourselves */
    orte_rml           = orte_rml_ftrm_module;
    orte_rml_component = &mca_rml_ftrm_component;

    opal_output_verbose(20, rml_ftrm_output_handle,
                        "orte_rml_ftrm: component_init(): Wrapped Component (%s)",
                        mca_rml_ftrm_wrapped_component.rml_version.mca_component_name);

    return NULL;
}
/*

Просмотреть файл

@ -28,54 +28,26 @@
#include "rml_ftrm.h"
orte_rml_component_t wrapped_component;
orte_rml_module_t wrapped_module;
orte_rml_component_t mca_rml_ftrm_wrapped_component;
orte_rml_module_t orte_rml_ftrm_wrapped_module;
/*
* Init (Module)
*/
static int num_inits = 0;
int orte_rml_ftrm_module_enable_comm(void)
{
int ret;
/*
* This is the first time init was called
* just need to swap some pointers
* Called from rml_base_select.c
*/
if( 0 == num_inits ) {
/* Copy the wrapped versions */
wrapped_module = orte_rml;
wrapped_component = *orte_rml_component;
/* Replace with ourselves */
orte_rml = orte_rml_ftrm_module;
orte_rml_component = &mca_rml_ftrm_component;
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: module_init(): Normal...");
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: module_init(): Wrapped Component (%s)",
wrapped_component.rml_version.mca_component_name);
}
/*
* This is *not* the first time we have been called
* now we need to send this information to the actual
* component.
* Called from orte_init_stage1.c
*/
else {
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: module_init(): Normal...");
if( NULL != wrapped_module.enable_comm ) {
if( ORTE_SUCCESS != (ret = wrapped_module.enable_comm() ) ) {
return ret;
}
if( NULL != orte_rml_ftrm_wrapped_module.enable_comm ) {
if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.enable_comm() ) ) {
return ret;
}
}
num_inits++;
return ORTE_SUCCESS;
}
@ -90,8 +62,8 @@ int orte_rml_ftrm_module_finalize(void)
"orte_rml_ftrm: module_finalize()");
if( NULL != wrapped_module.finalize ) {
if( ORTE_SUCCESS != (ret = wrapped_module.finalize() ) ) {
if( NULL != orte_rml_ftrm_wrapped_module.finalize ) {
if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.finalize() ) ) {
return ret;
}
}
@ -107,8 +79,8 @@ int orte_rml_ftrm_get_new_name(orte_process_name_t *name)
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: get_new_name()");
if( NULL != wrapped_module.get_new_name ) {
if( ORTE_SUCCESS != (ret = wrapped_module.get_new_name(name) ) ) {
if( NULL != orte_rml_ftrm_wrapped_module.get_new_name ) {
if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.get_new_name(name) ) ) {
return ret;
}
}
@ -128,8 +100,8 @@ char * orte_rml_ftrm_get_contact_info(void)
"orte_rml_ftrm: get_uri()");
if( NULL != wrapped_module.get_contact_info ) {
rtn_val = wrapped_module.get_contact_info();
if( NULL != orte_rml_ftrm_wrapped_module.get_contact_info ) {
rtn_val = orte_rml_ftrm_wrapped_module.get_contact_info();
}
return rtn_val;
@ -145,8 +117,8 @@ int orte_rml_ftrm_set_contact_info(const char* contact_info)
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: set_contact_info()");
if( NULL != wrapped_module.set_contact_info ) {
if( ORTE_SUCCESS != (ret = wrapped_module.set_contact_info(contact_info) ) ) {
if( NULL != orte_rml_ftrm_wrapped_module.set_contact_info ) {
if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.set_contact_info(contact_info) ) ) {
return ret;
}
}
@ -165,8 +137,8 @@ int orte_rml_ftrm_ping(const char* uri, const struct timeval* tv)
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: ping()");
if( NULL != wrapped_module.ping ) {
if( ORTE_SUCCESS != (ret = wrapped_module.ping(uri, tv) ) ) {
if( NULL != orte_rml_ftrm_wrapped_module.ping ) {
if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.ping(uri, tv) ) ) {
return ret;
}
}
@ -187,10 +159,11 @@ int orte_rml_ftrm_send(orte_process_name_t* peer,
int ret;
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: send()");
"orte_rml_ftrm: send(%s, %d, %d, %d )",
ORTE_NAME_PRINT(peer), count, tag, flags);
if( NULL != wrapped_module.send ) {
if( ORTE_SUCCESS != (ret = wrapped_module.send(peer, msg, count, tag, flags) ) ) {
if( NULL != orte_rml_ftrm_wrapped_module.send ) {
if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.send(peer, msg, count, tag, flags) ) ) {
return ret;
}
}
@ -212,10 +185,11 @@ int orte_rml_ftrm_send_nb(orte_process_name_t* peer,
int ret;
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: send_nb()");
"orte_rml_ftrm: send_nb(%s, %d, %d, %d )",
ORTE_NAME_PRINT(peer), count, tag, flags);
if( NULL != wrapped_module.send_nb ) {
if( ORTE_SUCCESS != (ret = wrapped_module.send_nb(peer, msg, count, tag, flags, cbfunc, cbdata) ) ) {
if( NULL != orte_rml_ftrm_wrapped_module.send_nb ) {
if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.send_nb(peer, msg, count, tag, flags, cbfunc, cbdata) ) ) {
return ret;
}
}
@ -234,10 +208,11 @@ int orte_rml_ftrm_send_buffer(orte_process_name_t* peer,
int ret;
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: send_buffer()");
"orte_rml_ftrm: send_buffer(%s, %d, %d )",
ORTE_NAME_PRINT(peer), tag, flags);
if( NULL != wrapped_module.send_buffer ) {
if( ORTE_SUCCESS != (ret = wrapped_module.send_buffer(peer, buffer, tag, flags) ) ) {
if( NULL != orte_rml_ftrm_wrapped_module.send_buffer ) {
if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.send_buffer(peer, buffer, tag, flags) ) ) {
return ret;
}
}
@ -258,10 +233,11 @@ int orte_rml_ftrm_send_buffer_nb(orte_process_name_t* peer,
int ret;
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: send_buffer_nb()");
"orte_rml_ftrm: send_buffer_nb(%s, %d, %d )",
ORTE_NAME_PRINT(peer), tag, flags);
if( NULL != wrapped_module.send_buffer_nb ) {
if( ORTE_SUCCESS != (ret = wrapped_module.send_buffer_nb(peer, buffer, tag, flags, cbfunc, cbdata) ) ) {
if( NULL != orte_rml_ftrm_wrapped_module.send_buffer_nb ) {
if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.send_buffer_nb(peer, buffer, tag, flags, cbfunc, cbdata) ) ) {
return ret;
}
}
@ -282,10 +258,11 @@ int orte_rml_ftrm_recv(orte_process_name_t* peer,
int ret;
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: recv()");
"orte_rml_ftrm: recv(%s, %d, %d, %d )",
ORTE_NAME_PRINT(peer), count, tag, flags);
if( NULL != wrapped_module.recv ) {
if( ORTE_SUCCESS != (ret = wrapped_module.recv(peer, msg, count, tag, flags) ) ) {
if( NULL != orte_rml_ftrm_wrapped_module.recv ) {
if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.recv(peer, msg, count, tag, flags) ) ) {
return ret;
}
}
@ -307,10 +284,11 @@ int orte_rml_ftrm_recv_nb(orte_process_name_t* peer,
int ret;
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: recv_nb()");
"orte_rml_ftrm: recv_nb(%s, %d, %d, %d )",
ORTE_NAME_PRINT(peer), count, tag, flags);
if( NULL != wrapped_module.recv_nb ) {
if( ORTE_SUCCESS != (ret = wrapped_module.recv_nb(peer, msg, count, tag, flags, cbfunc, cbdata) ) ) {
if( NULL != orte_rml_ftrm_wrapped_module.recv_nb ) {
if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.recv_nb(peer, msg, count, tag, flags, cbfunc, cbdata) ) ) {
return ret;
}
}
@ -329,10 +307,11 @@ int orte_rml_ftrm_recv_buffer(orte_process_name_t* peer,
int ret;
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: recv_buffer()");
"orte_rml_ftrm: recv_buffer(%s, %d )",
ORTE_NAME_PRINT(peer), tag);
if( NULL != wrapped_module.recv_buffer ) {
if( ORTE_SUCCESS != (ret = wrapped_module.recv_buffer(peer, buf, tag, flags) ) ) {
if( NULL != orte_rml_ftrm_wrapped_module.recv_buffer ) {
if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.recv_buffer(peer, buf, tag, flags) ) ) {
return ret;
}
}
@ -352,10 +331,11 @@ int orte_rml_ftrm_recv_buffer_nb(orte_process_name_t* peer,
int ret;
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: recv_buffer_nb()");
"orte_rml_ftrm: recv_buffer_nb(%s, %d, %d)",
ORTE_NAME_PRINT(peer), tag, flags);
if( NULL != wrapped_module.recv_buffer_nb ) {
if( ORTE_SUCCESS != (ret = wrapped_module.recv_buffer_nb(peer, tag, flags, cbfunc, cbdata) ) ) {
if( NULL != orte_rml_ftrm_wrapped_module.recv_buffer_nb ) {
if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.recv_buffer_nb(peer, tag, flags, cbfunc, cbdata) ) ) {
return ret;
}
}
@ -373,8 +353,8 @@ int orte_rml_ftrm_recv_cancel(orte_process_name_t* peer, orte_rml_tag_t tag)
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: recv_cancel()");
if( NULL != wrapped_module.recv_cancel ) {
if( ORTE_SUCCESS != (ret = wrapped_module.recv_cancel(peer, tag) ) ) {
if( NULL != orte_rml_ftrm_wrapped_module.recv_cancel ) {
if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.recv_cancel(peer, tag) ) ) {
return ret;
}
}
@ -393,8 +373,8 @@ int orte_rml_ftrm_add_exception_handler(orte_rml_exception_callback_t cbfunc)
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: add_exception_handler()");
if( NULL != wrapped_module.add_exception_handler ) {
if( ORTE_SUCCESS != (ret = wrapped_module.add_exception_handler(cbfunc) ) ) {
if( NULL != orte_rml_ftrm_wrapped_module.add_exception_handler ) {
if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.add_exception_handler(cbfunc) ) ) {
return ret;
}
}
@ -409,8 +389,8 @@ int orte_rml_ftrm_del_exception_handler(orte_rml_exception_callback_t cbfunc)
opal_output_verbose(20, rml_ftrm_output_handle,
"orte_rml_ftrm: del_exception_handler()");
if( NULL != wrapped_module.del_exception_handler ) {
if( ORTE_SUCCESS != (ret = wrapped_module.del_exception_handler(cbfunc) ) ) {
if( NULL != orte_rml_ftrm_wrapped_module.del_exception_handler ) {
if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.del_exception_handler(cbfunc) ) ) {
return ret;
}
}
@ -447,8 +427,8 @@ int orte_rml_ftrm_ft_event(int state)
/*
* The wrapped component is responsible for calling the OOB modules
*/
if( NULL != wrapped_module.ft_event ) {
if( ORTE_SUCCESS != (ret = wrapped_module.ft_event(state))) {
if( NULL != orte_rml_ftrm_wrapped_module.ft_event ) {
if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.ft_event(state))) {
return ret;
}
}

Просмотреть файл

@ -324,6 +324,19 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer, bool *
pid_t my_pid;
bool ack = true;
/*
* Do not send to self, as that is silly.
*/
if (0 == orte_ns.compare_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_HNP) ||
0 == orte_ns.compare_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_NAME) ) {
opal_output_verbose(10, orte_snapc_base_output,
"snapc:base: ckpt_init_cmd: Error: Do not send to self!\n");
return ORTE_SUCCESS;
}
opal_output_verbose(10, orte_snapc_base_output,
"snapc:base: ckpt_init_cmd: Sending commands\n");
/*
* Setup the buffer that we may send back
*/
@ -337,21 +350,33 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer, bool *
********************/
my_pid = getpid();
if (ORTE_SUCCESS != (ret = orte_dss.pack(loc_buffer, &my_pid, 1, ORTE_PID))) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_init_cmd: Error: DSS Pack (PID) failure (ret = %d) (LINE = %d)\n",
ret, __LINE__);
exit_status = ret;
goto cleanup;
}
if (0 > (ret = orte_rml.send_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0))) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_init_cmd: Error: Send Buffer (PID) Failure (ret = %d) (LINE = %d)\n",
ret, __LINE__);
exit_status = ret;
goto cleanup;
}
/* ACK */
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_recv_ack(peer, &ack)) ) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_init_cmd: Error: ACK (PID) Failure (ret = %d) (LINE = %d)\n",
ret, __LINE__);
exit_status = ORTE_ERROR;
goto cleanup;
}
if( !ack ) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_init_cmd: Error: NACK (PID) (LINE = %d)\n",
__LINE__);
exit_status = ORTE_ERROR;
goto cleanup;
}
@ -366,12 +391,18 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer, bool *
}
if( 0 > (ret = orte_rml.recv_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0)) ) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_init_cmd: Error: Recv (term) Failure (ret = %d) (LINE = %d)\n",
ret, __LINE__);
exit_status = ret;
goto cleanup;
}
n = 1;
if ( ORTE_SUCCESS != (ret = orte_dss.unpack(loc_buffer, term, &n, ORTE_BOOL)) ) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_init_cmd: Error: DSS Unpack (term) Failure (ret = %d) (LINE = %d)\n",
ret, __LINE__);
exit_status = ret;
goto cleanup;
}
@ -386,12 +417,18 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer, bool *
}
if( 0 > (ret = orte_rml.recv_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0)) ) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_init_cmd: Error: Recv (jobid) Failure (ret = %d) (LINE = %d)\n",
ret, __LINE__);
exit_status = ret;
goto cleanup;
}
n = 1;
if ( ORTE_SUCCESS != (ret = orte_dss.unpack(loc_buffer, jobid, &n, ORTE_SIZE)) ) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_init_cmd: Error: DSS Unpack (jobid) Failure (ret = %d) (LINE = %d)\n",
ret, __LINE__);
exit_status = ret;
goto cleanup;
}
@ -413,11 +450,17 @@ int orte_snapc_base_global_coord_recv_ack(orte_process_name_t* peer, bool *ack)
}
if( 0 > (ret = orte_rml.recv_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0)) ) {
opal_output(orte_snapc_base_output,
"snapc:base: recv_ack: Error: Recv Failed: %d\n",
ret);
exit_status = ret;
goto cleanup;
}
if ( ORTE_SUCCESS != (ret = orte_dss.unpack(loc_buffer, ack, &n, ORTE_BOOL)) ) {
opal_output(orte_snapc_base_output,
"snapc:base: recv_ack: Error: Unpack Failed: %d\n",
ret);
exit_status = ret;
goto cleanup;
}
@ -439,11 +482,17 @@ int orte_snapc_base_global_coord_send_ack(orte_process_name_t* peer, bool ack)
}
if (ORTE_SUCCESS != (ret = orte_dss.pack(buffer, &ack, 1, ORTE_BOOL))) {
opal_output(orte_snapc_base_output,
"snapc:base: send_ack: Error: Pack Failed: %d\n",
ret);
exit_status = ret;
goto cleanup;
}
if (0 > (ret = orte_rml.send_buffer(peer, buffer, ORTE_RML_TAG_CKPT, 0))) {
opal_output(orte_snapc_base_output,
"snapc:base: send_ack: Error: Send Failed: %d\n",
ret);
exit_status = ret;
goto cleanup;
}
@ -462,6 +511,20 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char
bool ack = true;
size_t str_len = 0;
/*
* Do not send to self, as that is silly.
*/
if (0 == orte_ns.compare_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_HNP) ||
0 == orte_ns.compare_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_NAME) ) {
opal_output_verbose(10, orte_snapc_base_output,
"snapc:base: ckpt_update_cmd: Error: Do not send to self!\n");
return ORTE_SUCCESS;
}
opal_output_verbose(10, orte_snapc_base_output,
"snapc:base: ckpt_update_cmd: Sending update command <%s> <%d> <%d>\n",
global_snapshot_handle, seq_num, ckpt_status);
/*
* Setup the buffer that we may send back
*/
@ -483,20 +546,32 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char
}
if (ORTE_SUCCESS != (ret = orte_dss.pack(loc_buffer, &ckpt_status, 1, ORTE_INT))) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_update_cmd: Error: DSS Pack (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
ret, __LINE__);
exit_status = ret;
goto cleanup;
}
if (0 > (ret = orte_rml.send_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0))) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_update_cmd: Error: Send (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
ret, __LINE__);
exit_status = ret;
goto cleanup;
}
/* ACK */
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_recv_ack(peer, &ack)) ) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_update_cmd: Error: ACK (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
ret, __LINE__);
exit_status = ORTE_ERROR;
goto cleanup;
}
if( !ack ) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_update_cmd: Error: NACK (ckpt_status) (LINE = %d)\n",
__LINE__);
exit_status = ORTE_ERROR;
goto cleanup;
}
@ -520,20 +595,32 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char
str_len = strlen(global_snapshot_handle);
if (ORTE_SUCCESS != (ret = orte_dss.pack(loc_buffer, &str_len, 1, ORTE_SIZE))) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_update_cmd: Error: DSS Pack (snapshot ref length) Failure (ret = %d) (LINE = %d)\n",
ret, __LINE__);
exit_status = ret;
goto cleanup;
}
if (0 > (ret = orte_rml.send_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0))) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_update_cmd: Error: Send (snapshot ref length) Failure (ret = %d) (LINE = %d)\n",
ret, __LINE__);
exit_status = ret;
goto cleanup;
}
/* ACK */
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_recv_ack(peer, &ack)) ) {
    /* Report this as an ACK failure so it can be told apart from the
     * send failure logged just before it. */
    opal_output(orte_snapc_base_output,
                "snapc:base: ckpt_update_cmd: Error: ACK (snapshot ref length) Failure (ret = %d) (LINE = %d)\n",
                ret, __LINE__);
    exit_status = ORTE_ERROR;
    goto cleanup;
}
if( !ack ) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_update_cmd: Error: NACK (snapshot ref length) (LINE = %d)\n",
__LINE__);
exit_status = ORTE_ERROR;
goto cleanup;
}
@ -551,24 +638,39 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char
}
if (ORTE_SUCCESS != (ret = orte_dss.pack(loc_buffer, &global_snapshot_handle, 1, ORTE_STRING))) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_update_cmd: Error: DSS Pack (snapshot handle) Failure (ret = %d) (LINE = %d)\n",
ret, __LINE__);
exit_status = ret;
goto cleanup;
}
if (ORTE_SUCCESS != (ret = orte_dss.pack(loc_buffer, &seq_num, 1, ORTE_INT))) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_update_cmd: Error: DSS Pack (seq number) Failure (ret = %d) (LINE = %d)\n",
ret, __LINE__);
exit_status = ret;
goto cleanup;
}
if (0 > (ret = orte_rml.send_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0))) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_update_cmd: Error: Send (snapshot handle, seq number) Failure (ret = %d) (LINE = %d)\n",
ret, __LINE__);
exit_status = ret;
goto cleanup;
}
/* ACK */
if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_recv_ack(peer, &ack)) ) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_update_cmd: Error: ACK (snapshot handle, seq number) Failure (ret = %d) (LINE = %d)\n",
ret, __LINE__);
exit_status = ORTE_ERROR;
goto cleanup;
}
if( !ack ) {
opal_output(orte_snapc_base_output,
"snapc:base: ckpt_update_cmd: Error: NACK (snapshot handle, seq number) (LINE = %d)\n",
__LINE__);
exit_status = ORTE_ERROR;
goto cleanup;
}

Просмотреть файл

@ -82,10 +82,11 @@ static bool snapc_full_global_is_done_yet(void);
static opal_mutex_t global_coord_mutex;
static orte_snapc_base_global_snapshot_t global_snapshot;
static orte_process_name_t orte_checkpoint_sender;
static orte_process_name_t orte_checkpoint_sender = {0,0};
static bool updated_job_to_running;
static size_t cur_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE;
static orte_jobid_t cur_job_id = 0;
/************************
* Function Definitions
@ -109,6 +110,42 @@ int global_coord_setup_job(orte_jobid_t jobid) {
orte_vpid_t vpid_start = 0, vpid_range = 0;
orte_std_cntr_t i;
/*
* If we have already setup a jobid, warn
* JJH: Hard restriction of only one jobid able to be checkpointed. FIX
*/
/*
* If we pass this way twice the first time will have been from:
* rmgr_urm.c: As the global coordinator
* The second time will have been from:
* odls_default_module.c: As the local coordinator.
* The later case means that we (as the HNP) are acting as both the global and
* local coordinators.
* JJH FIX NOTE:
* This fix imposes the restriction that only one jobid can be checkpointed
* at a time. In the future we will want to lift this restriction.
*/
if( 0 >= cur_job_id ) {
/* Global Coordinator pass */
cur_job_id = jobid;
}
else if ( jobid == cur_job_id ) {
/* Local Coordinator pass -- Will always happen after Global Coordinator Pass */
opal_output_verbose(10, mca_snapc_full_component.super.output_handle,
"global [%d]) Setup job (%d) again as the local coordinator for (%d)\n",
getpid(), jobid, cur_job_id);
return local_coord_setup_job(jobid);
}
else {
/* Already setup things for another job,
* We do not currently support the ability to checkpoint more than one
* jobid
*/
opal_output(mca_snapc_full_component.super.output_handle,
"global [%d]) Setup job (%d) Failed. Already setup job (%d)\n", getpid(), jobid, cur_job_id);
return ORTE_ERROR;
}
/*
* Start out with a sequence number just below the first
* This will be incremented when we checkpoint

Просмотреть файл

@ -426,7 +426,7 @@ static int create_appfile(orte_snapc_base_global_snapshot_t *snapshot)
vpid_snapshot = (orte_snapc_base_snapshot_t*)item;
fprintf(appfile, "#\n");
fprintf(appfile, "# Old Process Name: %%u.%u\n",
fprintf(appfile, "# Old Process Name: %u.%u\n",
vpid_snapshot->process_name.jobid,
vpid_snapshot->process_name.vpid);
fprintf(appfile, "#\n");