diff --git a/orte/mca/gpr/replica/communications/gpr_replica_remote_msg.c b/orte/mca/gpr/replica/communications/gpr_replica_remote_msg.c
index 28b00e82e7..4d81c8e3d4 100644
--- a/orte/mca/gpr/replica/communications/gpr_replica_remote_msg.c
+++ b/orte/mca/gpr/replica/communications/gpr_replica_remote_msg.c
@@ -80,9 +80,27 @@ int orte_gpr_replica_remote_notify(orte_process_name_t *recipient,
 
     if (0 > orte_rml.send_buffer_nb(recipient, buffer, ORTE_RML_TAG_GPR_NOTIFY, 0,
                                     orte_gpr_replica_remote_send_cb, NULL)) {
+#if 0
+        /* temporarily disable this error report
+         * With the new orted-failed-to-start code, we hold a caller in
+         * the rmgr.spawn function until either the app launches or
+         * it fails. Failure is indicated by a subscription to NUM_TERMINATED.
+         * However, that means that a notify_msg is going to get sent to a
+         * remote process during comm_spawn once all procs terminate. Since
+         * that process will have terminated, and the HNP processes the trigger
+         * first, the notify_msg send will fail as the recipient will have
+         * terminated and exited.
+         *
+         * A proper fix will require that we do something different
+         * in rmgr_proxy.spawn so we don't get a callback after the
+         * process is done
+         */
         ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
+        opal_output(0, "send failed to [%ld,%ld,%ld]", ORTE_NAME_ARGS(recipient));
+        orte_dss.dump(0, message, ORTE_GPR_NOTIFY_MSG);
         OPAL_THREAD_LOCK(&orte_gpr_replica_globals.mutex);
         return ORTE_ERR_COMM_FAILURE;
+#endif
     }
 
     OPAL_THREAD_LOCK(&orte_gpr_replica_globals.mutex);