diff --git a/orte/mca/rml/ftrm/rml_ftrm.h b/orte/mca/rml/ftrm/rml_ftrm.h index 73eb8632e5..3d54f8f71b 100644 --- a/orte/mca/rml/ftrm/rml_ftrm.h +++ b/orte/mca/rml/ftrm/rml_ftrm.h @@ -38,6 +38,9 @@ extern "C" { ORTE_MODULE_DECLSPEC extern orte_rml_component_t mca_rml_ftrm_component; ORTE_MODULE_DECLSPEC extern orte_rml_module_t orte_rml_ftrm_module; + ORTE_MODULE_DECLSPEC extern orte_rml_component_t mca_rml_ftrm_wrapped_component; + ORTE_MODULE_DECLSPEC extern orte_rml_module_t orte_rml_ftrm_wrapped_module; + /* * Init (Component) */ diff --git a/orte/mca/rml/ftrm/rml_ftrm_component.c b/orte/mca/rml/ftrm/rml_ftrm_component.c index f5d6b63a88..958b80c368 100644 --- a/orte/mca/rml/ftrm/rml_ftrm_component.c +++ b/orte/mca/rml/ftrm/rml_ftrm_component.c @@ -95,8 +95,30 @@ static int ftrm_priority = -1; */ orte_rml_module_t* orte_rml_ftrm_component_init(int* priority) { - *priority = ftrm_priority; - return &orte_rml_ftrm_module; + /* + * Asked to return a priority + */ + if( NULL != priority ) { + *priority = ftrm_priority; + return &orte_rml_ftrm_module; + } + /* + * Called a second time to swap module pointers + */ + else { + /* Copy the wrapped versions */ + orte_rml_ftrm_wrapped_module = orte_rml; + mca_rml_ftrm_wrapped_component = *orte_rml_component; + /* Replace with ourselves */ + orte_rml = orte_rml_ftrm_module; + orte_rml_component = &mca_rml_ftrm_component; + + opal_output_verbose(20, rml_ftrm_output_handle, + "orte_rml_ftrm: component_init(): Wrapped Component (%s)", + mca_rml_ftrm_wrapped_component.rml_version.mca_component_name); + + return NULL; + } } /* diff --git a/orte/mca/rml/ftrm/rml_ftrm_module.c b/orte/mca/rml/ftrm/rml_ftrm_module.c index 88d77c35e8..d044052f73 100644 --- a/orte/mca/rml/ftrm/rml_ftrm_module.c +++ b/orte/mca/rml/ftrm/rml_ftrm_module.c @@ -28,54 +28,26 @@ #include "rml_ftrm.h" -orte_rml_component_t wrapped_component; -orte_rml_module_t wrapped_module; +orte_rml_component_t mca_rml_ftrm_wrapped_component; +orte_rml_module_t 
orte_rml_ftrm_wrapped_module; /* * Init (Module) */ -static int num_inits = 0; int orte_rml_ftrm_module_enable_comm(void) { int ret; - /* - * This is the first time init was called - * just need to swap some pointers - * Called from rml_base_select.c - */ - if( 0 == num_inits ) { - /* Copy the wrapped versions */ - wrapped_module = orte_rml; - wrapped_component = *orte_rml_component; - /* Replace with ourselves */ - orte_rml = orte_rml_ftrm_module; - orte_rml_component = &mca_rml_ftrm_component; + opal_output_verbose(20, rml_ftrm_output_handle, + "orte_rml_ftrm: module_init(): Normal..."); - opal_output_verbose(20, rml_ftrm_output_handle, - "orte_rml_ftrm: module_init(): Wrapped Component (%s)", - wrapped_component.rml_version.mca_component_name); - } - /* - * This is *not* the first time we have been called - * now we need to send this information to the actual - * component. - * Called from orte_init_stage1.c - */ - else { - opal_output_verbose(20, rml_ftrm_output_handle, - "orte_rml_ftrm: module_init(): Normal..."); - - if( NULL != wrapped_module.enable_comm ) { - if( ORTE_SUCCESS != (ret = wrapped_module.enable_comm() ) ) { - return ret; - } + if( NULL != orte_rml_ftrm_wrapped_module.enable_comm ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.enable_comm() ) ) { + return ret; } } - num_inits++; - return ORTE_SUCCESS; } @@ -90,8 +62,8 @@ int orte_rml_ftrm_module_finalize(void) "orte_rml_ftrm: module_finalize()"); - if( NULL != wrapped_module.finalize ) { - if( ORTE_SUCCESS != (ret = wrapped_module.finalize() ) ) { + if( NULL != orte_rml_ftrm_wrapped_module.finalize ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.finalize() ) ) { return ret; } } @@ -107,8 +79,8 @@ int orte_rml_ftrm_get_new_name(orte_process_name_t *name) opal_output_verbose(20, rml_ftrm_output_handle, "orte_rml_ftrm: get_new_name()"); - if( NULL != wrapped_module.get_new_name ) { - if( ORTE_SUCCESS != (ret = wrapped_module.get_new_name(name) ) ) { + if( NULL != 
orte_rml_ftrm_wrapped_module.get_new_name ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.get_new_name(name) ) ) { return ret; } } @@ -128,8 +100,8 @@ char * orte_rml_ftrm_get_contact_info(void) "orte_rml_ftrm: get_uri()"); - if( NULL != wrapped_module.get_contact_info ) { - rtn_val = wrapped_module.get_contact_info(); + if( NULL != orte_rml_ftrm_wrapped_module.get_contact_info ) { + rtn_val = orte_rml_ftrm_wrapped_module.get_contact_info(); } return rtn_val; @@ -145,8 +117,8 @@ int orte_rml_ftrm_set_contact_info(const char* contact_info) opal_output_verbose(20, rml_ftrm_output_handle, "orte_rml_ftrm: set_contact_info()"); - if( NULL != wrapped_module.set_contact_info ) { - if( ORTE_SUCCESS != (ret = wrapped_module.set_contact_info(contact_info) ) ) { + if( NULL != orte_rml_ftrm_wrapped_module.set_contact_info ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.set_contact_info(contact_info) ) ) { return ret; } } @@ -165,8 +137,8 @@ int orte_rml_ftrm_ping(const char* uri, const struct timeval* tv) opal_output_verbose(20, rml_ftrm_output_handle, "orte_rml_ftrm: ping()"); - if( NULL != wrapped_module.ping ) { - if( ORTE_SUCCESS != (ret = wrapped_module.ping(uri, tv) ) ) { + if( NULL != orte_rml_ftrm_wrapped_module.ping ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.ping(uri, tv) ) ) { return ret; } } @@ -187,10 +159,11 @@ int orte_rml_ftrm_send(orte_process_name_t* peer, int ret; opal_output_verbose(20, rml_ftrm_output_handle, - "orte_rml_ftrm: send()"); + "orte_rml_ftrm: send(%s, %d, %d, %d )", + ORTE_NAME_PRINT(peer), count, tag, flags); - if( NULL != wrapped_module.send ) { - if( ORTE_SUCCESS != (ret = wrapped_module.send(peer, msg, count, tag, flags) ) ) { + if( NULL != orte_rml_ftrm_wrapped_module.send ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.send(peer, msg, count, tag, flags) ) ) { return ret; } } @@ -212,10 +185,11 @@ int orte_rml_ftrm_send_nb(orte_process_name_t* peer, int ret; 
opal_output_verbose(20, rml_ftrm_output_handle, - "orte_rml_ftrm: send_nb()"); + "orte_rml_ftrm: send_nb(%s, %d, %d, %d )", + ORTE_NAME_PRINT(peer), count, tag, flags); - if( NULL != wrapped_module.send_nb ) { - if( ORTE_SUCCESS != (ret = wrapped_module.send_nb(peer, msg, count, tag, flags, cbfunc, cbdata) ) ) { + if( NULL != orte_rml_ftrm_wrapped_module.send_nb ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.send_nb(peer, msg, count, tag, flags, cbfunc, cbdata) ) ) { return ret; } } @@ -234,10 +208,11 @@ int orte_rml_ftrm_send_buffer(orte_process_name_t* peer, int ret; opal_output_verbose(20, rml_ftrm_output_handle, - "orte_rml_ftrm: send_buffer()"); + "orte_rml_ftrm: send_buffer(%s, %d, %d )", + ORTE_NAME_PRINT(peer), tag, flags); - if( NULL != wrapped_module.send_buffer ) { - if( ORTE_SUCCESS != (ret = wrapped_module.send_buffer(peer, buffer, tag, flags) ) ) { + if( NULL != orte_rml_ftrm_wrapped_module.send_buffer ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.send_buffer(peer, buffer, tag, flags) ) ) { return ret; } } @@ -258,10 +233,11 @@ int orte_rml_ftrm_send_buffer_nb(orte_process_name_t* peer, int ret; opal_output_verbose(20, rml_ftrm_output_handle, - "orte_rml_ftrm: send_buffer_nb()"); + "orte_rml_ftrm: send_buffer_nb(%s, %d, %d )", + ORTE_NAME_PRINT(peer), tag, flags); - if( NULL != wrapped_module.send_buffer_nb ) { - if( ORTE_SUCCESS != (ret = wrapped_module.send_buffer_nb(peer, buffer, tag, flags, cbfunc, cbdata) ) ) { + if( NULL != orte_rml_ftrm_wrapped_module.send_buffer_nb ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.send_buffer_nb(peer, buffer, tag, flags, cbfunc, cbdata) ) ) { return ret; } } @@ -282,10 +258,11 @@ int orte_rml_ftrm_recv(orte_process_name_t* peer, int ret; opal_output_verbose(20, rml_ftrm_output_handle, - "orte_rml_ftrm: recv()"); + "orte_rml_ftrm: recv(%s, %d, %d, %d )", + ORTE_NAME_PRINT(peer), count, tag, flags); - if( NULL != wrapped_module.recv ) { - if( ORTE_SUCCESS != (ret = 
wrapped_module.recv(peer, msg, count, tag, flags) ) ) { + if( NULL != orte_rml_ftrm_wrapped_module.recv ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.recv(peer, msg, count, tag, flags) ) ) { return ret; } } @@ -307,10 +284,11 @@ int orte_rml_ftrm_recv_nb(orte_process_name_t* peer, int ret; opal_output_verbose(20, rml_ftrm_output_handle, - "orte_rml_ftrm: recv_nb()"); + "orte_rml_ftrm: recv_nb(%s, %d, %d, %d )", + ORTE_NAME_PRINT(peer), count, tag, flags); - if( NULL != wrapped_module.recv_nb ) { - if( ORTE_SUCCESS != (ret = wrapped_module.recv_nb(peer, msg, count, tag, flags, cbfunc, cbdata) ) ) { + if( NULL != orte_rml_ftrm_wrapped_module.recv_nb ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.recv_nb(peer, msg, count, tag, flags, cbfunc, cbdata) ) ) { return ret; } } @@ -329,10 +307,11 @@ int orte_rml_ftrm_recv_buffer(orte_process_name_t* peer, int ret; opal_output_verbose(20, rml_ftrm_output_handle, - "orte_rml_ftrm: recv_buffer()"); + "orte_rml_ftrm: recv_buffer(%s, %d )", + ORTE_NAME_PRINT(peer), tag); - if( NULL != wrapped_module.recv_buffer ) { - if( ORTE_SUCCESS != (ret = wrapped_module.recv_buffer(peer, buf, tag, flags) ) ) { + if( NULL != orte_rml_ftrm_wrapped_module.recv_buffer ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.recv_buffer(peer, buf, tag, flags) ) ) { return ret; } } @@ -352,10 +331,11 @@ int orte_rml_ftrm_recv_buffer_nb(orte_process_name_t* peer, int ret; opal_output_verbose(20, rml_ftrm_output_handle, - "orte_rml_ftrm: recv_buffer_nb()"); + "orte_rml_ftrm: recv_buffer_nb(%s, %d, %d)", + ORTE_NAME_PRINT(peer), tag, flags); - if( NULL != wrapped_module.recv_buffer_nb ) { - if( ORTE_SUCCESS != (ret = wrapped_module.recv_buffer_nb(peer, tag, flags, cbfunc, cbdata) ) ) { + if( NULL != orte_rml_ftrm_wrapped_module.recv_buffer_nb ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.recv_buffer_nb(peer, tag, flags, cbfunc, cbdata) ) ) { return ret; } } @@ -373,8 +353,8 @@ int 
orte_rml_ftrm_recv_cancel(orte_process_name_t* peer, orte_rml_tag_t tag) opal_output_verbose(20, rml_ftrm_output_handle, "orte_rml_ftrm: recv_cancel()"); - if( NULL != wrapped_module.recv_cancel ) { - if( ORTE_SUCCESS != (ret = wrapped_module.recv_cancel(peer, tag) ) ) { + if( NULL != orte_rml_ftrm_wrapped_module.recv_cancel ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.recv_cancel(peer, tag) ) ) { return ret; } } @@ -393,8 +373,8 @@ int orte_rml_ftrm_add_exception_handler(orte_rml_exception_callback_t cbfunc) opal_output_verbose(20, rml_ftrm_output_handle, "orte_rml_ftrm: add_exception_handler()"); - if( NULL != wrapped_module.add_exception_handler ) { - if( ORTE_SUCCESS != (ret = wrapped_module.add_exception_handler(cbfunc) ) ) { + if( NULL != orte_rml_ftrm_wrapped_module.add_exception_handler ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.add_exception_handler(cbfunc) ) ) { return ret; } } @@ -409,8 +389,8 @@ int orte_rml_ftrm_del_exception_handler(orte_rml_exception_callback_t cbfunc) opal_output_verbose(20, rml_ftrm_output_handle, "orte_rml_ftrm: del_exception_handler()"); - if( NULL != wrapped_module.del_exception_handler ) { - if( ORTE_SUCCESS != (ret = wrapped_module.del_exception_handler(cbfunc) ) ) { + if( NULL != orte_rml_ftrm_wrapped_module.del_exception_handler ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.del_exception_handler(cbfunc) ) ) { return ret; } } @@ -447,8 +427,8 @@ int orte_rml_ftrm_ft_event(int state) /* * The wrapped component is responsible for calling the OOB modules */ - if( NULL != wrapped_module.ft_event ) { - if( ORTE_SUCCESS != (ret = wrapped_module.ft_event(state))) { + if( NULL != orte_rml_ftrm_wrapped_module.ft_event ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.ft_event(state))) { return ret; } } diff --git a/orte/mca/snapc/base/snapc_base_fns.c b/orte/mca/snapc/base/snapc_base_fns.c index 826cec90b9..3a9f980224 100644 --- a/orte/mca/snapc/base/snapc_base_fns.c +++ 
b/orte/mca/snapc/base/snapc_base_fns.c @@ -324,6 +324,19 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer, bool * pid_t my_pid; bool ack = true; + /* + * Do not send to self, as that is silly. + */ + if (0 == orte_ns.compare_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_HNP) || + 0 == orte_ns.compare_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_NAME) ) { + opal_output_verbose(10, orte_snapc_base_output, + "snapc:base: ckpt_init_cmd: Error: Do not send to self!\n"); + return ORTE_SUCCESS; + } + + opal_output_verbose(10, orte_snapc_base_output, + "snapc:base: ckpt_init_cmd: Sending commands\n"); + /* * Setup the buffer that we may send back */ @@ -337,21 +350,33 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer, bool * ********************/ my_pid = getpid(); if (ORTE_SUCCESS != (ret = orte_dss.pack(loc_buffer, &my_pid, 1, ORTE_PID))) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_init_cmd: Error: DSS Pack (PID) failure (ret = %d) (LINE = %d)\n", + ret, __LINE__); exit_status = ret; goto cleanup; } if (0 > (ret = orte_rml.send_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0))) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_init_cmd: Error: Send Buffer (PID) Failure (ret = %d) (LINE = %d)\n", + ret, __LINE__); exit_status = ret; goto cleanup; } /* ACK */ if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_recv_ack(peer, &ack)) ) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_init_cmd: Error: ACK (PID) Failure (ret = %d) (LINE = %d)\n", + ret, __LINE__); exit_status = ORTE_ERROR; goto cleanup; } if( !ack ) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_init_cmd: Error: NACK (PID) (LINE = %d)\n", + __LINE__); exit_status = ORTE_ERROR; goto cleanup; } @@ -366,12 +391,18 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer, bool * } if( 0 > (ret = orte_rml.recv_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0)) ) { + opal_output(orte_snapc_base_output, 
+ "snapc:base: ckpt_init_cmd: Error: Recv (term) Failure (ret = %d) (LINE = %d)\n", + ret, __LINE__); exit_status = ret; goto cleanup; } n = 1; if ( ORTE_SUCCESS != (ret = orte_dss.unpack(loc_buffer, term, &n, ORTE_BOOL)) ) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_init_cmd: Error: DSS Unpack (term) Failure (ret = %d) (LINE = %d)\n", + ret, __LINE__); exit_status = ret; goto cleanup; } @@ -386,12 +417,18 @@ int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer, bool * } if( 0 > (ret = orte_rml.recv_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0)) ) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_init_cmd: Error: Recv (jobid) Failure (ret = %d) (LINE = %d)\n", + ret, __LINE__); exit_status = ret; goto cleanup; } n = 1; if ( ORTE_SUCCESS != (ret = orte_dss.unpack(loc_buffer, jobid, &n, ORTE_SIZE)) ) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_init_cmd: Error: DSS Unpack (jobid) Failure (ret = %d) (LINE = %d)\n", + ret, __LINE__); exit_status = ret; goto cleanup; } @@ -413,11 +450,17 @@ int orte_snapc_base_global_coord_recv_ack(orte_process_name_t* peer, bool *ack) } if( 0 > (ret = orte_rml.recv_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0)) ) { + opal_output(orte_snapc_base_output, + "snapc:base: recv_ack: Error: Recv Failed: %d\n", + ret); exit_status = ret; goto cleanup; } if ( ORTE_SUCCESS != (ret = orte_dss.unpack(loc_buffer, ack, &n, ORTE_BOOL)) ) { + opal_output(orte_snapc_base_output, + "snapc:base: recv_ack: Error: Unpack Failed: %d\n", + ret); exit_status = ret; goto cleanup; } @@ -439,11 +482,17 @@ int orte_snapc_base_global_coord_send_ack(orte_process_name_t* peer, bool ack) } if (ORTE_SUCCESS != (ret = orte_dss.pack(buffer, &ack, 1, ORTE_BOOL))) { + opal_output(orte_snapc_base_output, + "snapc:base: send_ack: Error: Pack Failed: %d\n", + ret); exit_status = ret; goto cleanup; } if (0 > (ret = orte_rml.send_buffer(peer, buffer, ORTE_RML_TAG_CKPT, 0))) { + opal_output(orte_snapc_base_output, + 
"snapc:base: send_ack: Error: Send Failed: %d\n", + ret); exit_status = ret; goto cleanup; } @@ -462,6 +511,20 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char bool ack = true; size_t str_len = 0; + /* + * Do not send to self, as that is silly. + */ + if (0 == orte_ns.compare_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_HNP) || + 0 == orte_ns.compare_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_NAME) ) { + opal_output_verbose(10, orte_snapc_base_output, + "snapc:base: ckpt_update_cmd: Error: Do not send to self!\n"); + return ORTE_SUCCESS; + } + + opal_output_verbose(10, orte_snapc_base_output, + "snapc:base: ckpt_update_cmd: Sending update command <%s> <%d> <%d>\n", + global_snapshot_handle, seq_num, ckpt_status); + /* * Setup the buffer that we may send back */ @@ -483,20 +546,32 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char } if (ORTE_SUCCESS != (ret = orte_dss.pack(loc_buffer, &ckpt_status, 1, ORTE_INT))) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_update_cmd: Error: DSS Pack (ckpt_status) Failure (ret = %d) (LINE = %d)\n", + ret, __LINE__); exit_status = ret; goto cleanup; } if (0 > (ret = orte_rml.send_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0))) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_update_cmd: Error: Send (ckpt_status) Failure (ret = %d) (LINE = %d)\n", + ret, __LINE__); exit_status = ret; goto cleanup; } /* ACK */ if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_recv_ack(peer, &ack)) ) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_update_cmd: Error: ACK (ckpt_status) Failure (ret = %d) (LINE = %d)\n", + ret, __LINE__); exit_status = ORTE_ERROR; goto cleanup; } if( !ack ) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_update_cmd: Error: NACK (ckpt_status) (LINE = %d)\n", + __LINE__); exit_status = ORTE_ERROR; goto cleanup; } @@ -520,20 +595,32 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* 
peer, char str_len = strlen(global_snapshot_handle); if (ORTE_SUCCESS != (ret = orte_dss.pack(loc_buffer, &str_len, 1, ORTE_SIZE))) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_update_cmd: Error: DSS Pack (snapshot ref length) Failure (ret = %d) (LINE = %d)\n", + ret, __LINE__); exit_status = ret; goto cleanup; } if (0 > (ret = orte_rml.send_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0))) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_update_cmd: Error: Send (snapshot ref length) Failure (ret = %d) (LINE = %d)\n", + ret, __LINE__); exit_status = ret; goto cleanup; } /* ACK */ if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_recv_ack(peer, &ack)) ) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_update_cmd: Error: ACK (snapshot ref length) Failure (ret = %d) (LINE = %d)\n", + ret, __LINE__); exit_status = ORTE_ERROR; goto cleanup; } if( !ack ) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_update_cmd: Error: NACK (snapshot ref length) (LINE = %d)\n", + __LINE__); exit_status = ORTE_ERROR; goto cleanup; } @@ -551,24 +638,39 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, char } if (ORTE_SUCCESS != (ret = orte_dss.pack(loc_buffer, &global_snapshot_handle, 1, ORTE_STRING))) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_update_cmd: Error: DSS Pack (snapshot handle) Failure (ret = %d) (LINE = %d)\n", + ret, __LINE__); exit_status = ret; goto cleanup; } if (ORTE_SUCCESS != (ret = orte_dss.pack(loc_buffer, &seq_num, 1, ORTE_INT))) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_update_cmd: Error: DSS Pack (seq number) Failure (ret = %d) (LINE = %d)\n", + ret, __LINE__); exit_status = ret; goto cleanup; } if (0 > (ret = orte_rml.send_buffer(peer, loc_buffer, ORTE_RML_TAG_CKPT, 0))) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_update_cmd: Error: Send (snapshot handle, seq number) Failure (ret = %d) (LINE = %d)\n", + ret, __LINE__); exit_status
= ret; goto cleanup; } /* ACK */ if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_recv_ack(peer, &ack)) ) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_update_cmd: Error: ACK (snapshot handle, seq number) Failure (ret = %d) (LINE = %d)\n", + ret, __LINE__); exit_status = ORTE_ERROR; goto cleanup; } if( !ack ) { + opal_output(orte_snapc_base_output, + "snapc:base: ckpt_update_cmd: Error: NACK (snapshot handle, seq number) (LINE = %d)\n", + __LINE__); exit_status = ORTE_ERROR; goto cleanup; } diff --git a/orte/mca/snapc/full/snapc_full_global.c b/orte/mca/snapc/full/snapc_full_global.c index 808260f868..e8a665a9e8 100644 --- a/orte/mca/snapc/full/snapc_full_global.c +++ b/orte/mca/snapc/full/snapc_full_global.c @@ -82,10 +82,11 @@ static bool snapc_full_global_is_done_yet(void); static opal_mutex_t global_coord_mutex; static orte_snapc_base_global_snapshot_t global_snapshot; -static orte_process_name_t orte_checkpoint_sender; +static orte_process_name_t orte_checkpoint_sender = {0,0}; static bool updated_job_to_running; static size_t cur_job_ckpt_state = ORTE_SNAPC_CKPT_STATE_NONE; +static orte_jobid_t cur_job_id = 0; /************************ * Function Definitions @@ -109,6 +110,42 @@ int global_coord_setup_job(orte_jobid_t jobid) { orte_vpid_t vpid_start = 0, vpid_range = 0; orte_std_cntr_t i; + /* + * If we have already setup a jobid, warn + * JJH: Hard restriction of only one jobid able to be checkpointed. FIX + */ + /* + * If we pass this way twice the first time will have been from: + * rmgr_urm.c: As the global coordinator + * The second time will have been from: + * odls_default_module.c: As the local coordinator. + * The latter case means that we (as the HNP) are acting as both the global and + * local coordinators. + * JJH FIX NOTE: + * This fix imposes the restriction that only one jobid can be checkpointed + * at a time. In the future we will want to lift this restriction.
+ */ + if( 0 >= cur_job_id ) { + /* Global Coordinator pass */ + cur_job_id = jobid; + } + else if ( jobid == cur_job_id ) { + /* Local Coordinator pass -- Will always happen after Global Coordinator Pass */ + opal_output_verbose(10, mca_snapc_full_component.super.output_handle, + "global [%d]) Setup job (%d) again as the local coordinator for (%d)\n", + getpid(), jobid, cur_job_id); + return local_coord_setup_job(jobid); + } + else { + /* Already setup things for another job, + * We do not currently support the ability to checkpoint more than one + * jobid + */ + opal_output(mca_snapc_full_component.super.output_handle, + "global [%d]) Setup job (%d) Failed. Already setup job (%d)\n", getpid(), jobid, cur_job_id); + return ORTE_ERROR; + } + /* * Start out with a sequence number just below the first * This will be incremented when we checkpoint diff --git a/orte/tools/orte-restart/orte-restart.c b/orte/tools/orte-restart/orte-restart.c index 135fa00254..7f5f594db9 100644 --- a/orte/tools/orte-restart/orte-restart.c +++ b/orte/tools/orte-restart/orte-restart.c @@ -426,7 +426,7 @@ static int create_appfile(orte_snapc_base_global_snapshot_t *snapshot) vpid_snapshot = (orte_snapc_base_snapshot_t*)item; fprintf(appfile, "#\n"); - fprintf(appfile, "# Old Process Name: %%u.%u\n", + fprintf(appfile, "# Old Process Name: %u.%u\n", vpid_snapshot->process_name.jobid, vpid_snapshot->process_name.vpid); fprintf(appfile, "#\n");