From 3c4c84017162498bdbbed98dc8f5b33a5a9aa7e3 Mon Sep 17 00:00:00 2001
From: Jim Garlick <garlick.jim@gmail.com>
Date: Fri, 12 Apr 2024 12:38:52 -0700
Subject: [PATCH 1/5] broker: fix shadowed variable

Problem: parent_cb() defines 'type', then redefines it in
one of its blocks.

Rename the variable in the block to avoid this.
---
 src/broker/overlay.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/broker/overlay.c b/src/broker/overlay.c
index 7b2e80ca015e..47b870f939e4 100644
--- a/src/broker/overlay.c
+++ b/src/broker/overlay.c
@@ -1013,11 +1013,11 @@ static void parent_cb (flux_reactor_t *r,
             flux_msg_route_disable (msg);
             break;
         case FLUX_MSGTYPE_CONTROL: {
-            int type, reason;
-            if (flux_control_decode (msg, &type, &reason) < 0) {
+            int ctrl_type, reason;
+            if (flux_control_decode (msg, &ctrl_type, &reason) < 0) {
                 logdrop (ov, OVERLAY_UPSTREAM, msg, "malformed control");
             }
-            else if (type == CONTROL_DISCONNECT) {
+            else if (ctrl_type == CONTROL_DISCONNECT) {
                 flux_log (ov->h, LOG_CRIT,
                           "%s (rank %lu) sent disconnect control message",
                           flux_get_hostbyrank (ov->h, ov->parent.rank),

From 24b2c822c9c896cf19a38ae8bb1997bd9325ea34 Mon Sep 17 00:00:00 2001
From: Jim Garlick <garlick.jim@gmail.com>
Date: Fri, 12 Apr 2024 12:41:13 -0700
Subject: [PATCH 2/5] broker: honor control disconnect during hello

Problem: if the parent broker reboots after hello was sent
but before a response is received, the child broker is stuck
unable to connect.

As noted in #5881, we see a continuous stream of log messages on
the client:

   DROP upstream control topic - :
     message received before hello handshake completed

The messages may get started because
1. child sends hello
2. parent reboots and misses the hello
3. child starts sending periodic heartbeat control message to
   the parent after 5s regardless of the hello handshake status
   (this is by design - see below)
4. parent expects to get a hello request before any other message,
   so it sends a disconnect control message
5. child logs the above message (which goes to the parent)
   but doesn't disconnect
6. parent recieves log message, goto 4

Note that although the broker's downstream 0MQ ROUTER socket exposes
TCP disconnects as you might expect for a regular socket, the upstream
0MQ DEALER socket does not.  This is why the child does not detect the
parent restart after sending hello - it has seamlessly reconnected to
the new parent but the hello request that was likely safely delivered to
the old parent before it died and is gone.

The design to work around this passive reconnect behavior is to start
the periodic heartbeat control messages early and force something to happen.
The parent does the right thing by sending the disconnect in response to
the heartbeat.  Unfortunatley, the child ignores the disconnect and a
fast but unentertaining game of pingpong ensues.

To break the cycle, handle the control disconnect message in the child
as originally intended.  That will cause the broker to be restarted by
systemd and introductions can be restarted from scratch.

This first noted in #5881.
---
 src/broker/overlay.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/broker/overlay.c b/src/broker/overlay.c
index 47b870f939e4..26652ffacacd 100644
--- a/src/broker/overlay.c
+++ b/src/broker/overlay.c
@@ -993,12 +993,13 @@ static void parent_cb (flux_reactor_t *r,
             && flux_msg_get_topic (msg, &topic) == 0
             && streq (topic, "overlay.hello")) {
             hello_response_handler (ov, msg);
+            goto done;
         }
-        else {
+        else if (type != FLUX_MSGTYPE_CONTROL) {
             logdrop (ov, OVERLAY_UPSTREAM, msg,
                      "message received before hello handshake completed");
+            goto done;
         }
-        goto done;
     }
     switch (type) {
         case FLUX_MSGTYPE_RESPONSE:

From 92338733743670e287ab7676e3e1a223df37f81f Mon Sep 17 00:00:00 2001
From: Jim Garlick <garlick.jim@gmail.com>
Date: Fri, 12 Apr 2024 14:03:09 -0700
Subject: [PATCH 3/5] broker: prevent new clients during shutdown

Problem: if an instance is slow to shut down, nodes could restart
and rejoin the dying instance.

Set a flag when the broker enters cleanup state that causes any
new clients that send a hello message to get an immediate control
disconnect.
---
 src/broker/overlay.c       | 16 +++++++++++-----
 src/broker/overlay.h       |  5 ++++-
 src/broker/state_machine.c | 12 +++++++++++-
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/src/broker/overlay.c b/src/broker/overlay.c
index 26652ffacacd..eabc879c6cee 100644
--- a/src/broker/overlay.c
+++ b/src/broker/overlay.c
@@ -170,6 +170,7 @@ struct overlay {
 
     struct parent parent;
 
+    bool shutdown_in_progress;  // no new downstream connections permitted
     void *bind_zsock;           // NULL if no downstream peers
     char *bind_uri;
     flux_watcher_t *bind_w;
@@ -900,7 +901,8 @@ static void child_cb (flux_reactor_t *r,
          */
         if (type == FLUX_MSGTYPE_REQUEST
             && flux_msg_get_topic (msg, &topic) == 0
-            && streq (topic, "overlay.hello")) {
+            && streq (topic, "overlay.hello")
+            && !ov->shutdown_in_progress) {
             hello_request_handler (ov, msg);
         }
         /* Or one of the following cases occurred that requires (or at least
@@ -912,6 +914,7 @@ static void child_cb (flux_reactor_t *r,
          *    the DISCONNECT and connectivity has been restored.
          * 3) This is a new-to-us peer because *we* restarted without getting
          *    a message through (e.g. crash)
+         * 4) A peer said hello while shutdown is in progress
          * Control send failures may occur, see flux-framework/flux-core#4464.
          * Don't log here, see flux-framework/flux-core#4180.
          */
@@ -1410,11 +1413,14 @@ int overlay_bind (struct overlay *ov, const char *uri)
 
 /* Don't allow downstream peers to reconnect while we are shutting down.
  */
-void overlay_shutdown (struct overlay *overlay)
+void overlay_shutdown (struct overlay *overlay, bool unbind)
 {
-    if (overlay->bind_zsock && overlay->bind_uri)
-        if (zmq_unbind (overlay->bind_zsock, overlay->bind_uri) < 0)
-            flux_log (overlay->h, LOG_ERR, "zmq_unbind failed");
+    overlay->shutdown_in_progress = true;
+    if (unbind) {
+        if (overlay->bind_zsock && overlay->bind_uri)
+            if (zmq_unbind (overlay->bind_zsock, overlay->bind_uri) < 0)
+                flux_log (overlay->h, LOG_ERR, "zmq_unbind failed");
+    }
 }
 
 /* Call after overlay bootstrap (bind/connect),
diff --git a/src/broker/overlay.h b/src/broker/overlay.h
index af40ebb72082..445dc812b349 100644
--- a/src/broker/overlay.h
+++ b/src/broker/overlay.h
@@ -150,8 +150,11 @@ int overlay_set_monitor_cb (struct overlay *ov,
 int overlay_register_attrs (struct overlay *overlay);
 
 /* Stop allowing new connections from downstream peers.
+ * If unbind is false, stop all communication on the socket.
+ * Otherwise arrange to send a disconnect control message in response
+ * to all messages.
  */
-void overlay_shutdown (struct overlay *overlay);
+void overlay_shutdown (struct overlay *overlay, bool unbind);
 
 #endif /* !_BROKER_OVERLAY_H */
 
diff --git a/src/broker/state_machine.c b/src/broker/state_machine.c
index 48d12632145f..a806ea1256eb 100644
--- a/src/broker/state_machine.c
+++ b/src/broker/state_machine.c
@@ -360,6 +360,12 @@ static void action_run (struct state_machine *s)
 
 static void action_cleanup (struct state_machine *s)
 {
+    /* Prevent new downstream clients from saying hello, but
+     * let existing ones continue to communiate so they can
+     * shut down and disconnect.
+     */
+    overlay_shutdown (s->ctx->overlay, false);
+
     if (runat_is_defined (s->ctx->runat, "cleanup")) {
         if (runat_start (s->ctx->runat, "cleanup", runat_completion_cb, s) < 0) {
             flux_log_error (s->ctx->h, "runat_start cleanup");
@@ -372,7 +378,11 @@ static void action_cleanup (struct state_machine *s)
 
 static void action_finalize (struct state_machine *s)
 {
-    overlay_shutdown (s->ctx->overlay);
+    /* Now that all clients have disconnected, finalize all
+     * downstream communication.
+     */
+    overlay_shutdown (s->ctx->overlay, true);
+
     if (runat_is_defined (s->ctx->runat, "rc3")) {
         if (runat_start (s->ctx->runat, "rc3", runat_completion_cb, s) < 0) {
             flux_log_error (s->ctx->h, "runat_start rc3");

From d4fa3dc1706dc2a226c9a2e2bea7466f9ce423b5 Mon Sep 17 00:00:00 2001
From: Jim Garlick <garlick.jim@gmail.com>
Date: Fri, 12 Apr 2024 14:05:56 -0700
Subject: [PATCH 4/5] systemd: change RestartSec=5s to 30s

Problem: when nodes of a system instance are forcibly disconnected,
they can reconnect fairly quickly because we allow systemd to
restart them in 5s.

While Flux is shutting down, hello requests are now immediately sent
a control disconnect message, but if the fanout is large and the
shutdown is slow, the whack-a-mole overhead may be non-negligible.

Raise the systemd unit file RestartSec value from 5s to 30s.
---
 etc/flux.service.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etc/flux.service.in b/etc/flux.service.in
index f992187f3224..fb0d57793741 100644
--- a/etc/flux.service.in
+++ b/etc/flux.service.in
@@ -29,7 +29,7 @@ ExecStart=/bin/bash -c '\
 SyslogIdentifier=flux
 ExecReload=@X_BINDIR@/flux config reload
 Restart=always
-RestartSec=5s
+RestartSec=30s
 RestartPreventExitStatus=42
 SuccessExitStatus=42
 User=flux

From 61bb419c1bc9685ba8cfe072d7433b695d3ab114 Mon Sep 17 00:00:00 2001
From: Jim Garlick <garlick.jim@gmail.com>
Date: Fri, 12 Apr 2024 14:35:57 -0700
Subject: [PATCH 5/5] broker: improve LOST error message

Problem: many "transitioning to LOST due to EHOSTUNREACH error on send"
messages were logged during shutdown of a large instance.

This is still not well understood but we can perhaps get a little more
information for next time.

Add the previous state and the time the peer has spent in that
state (in whole seconds).

Hopefully will help with #5881 if it occurs again.
---
 src/broker/overlay.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/broker/overlay.c b/src/broker/overlay.c
index eabc879c6cee..dfd77c9c4199 100644
--- a/src/broker/overlay.c
+++ b/src/broker/overlay.c
@@ -777,9 +777,12 @@ static int overlay_sendmsg_child (struct overlay *ov, const flux_msg_t *msg)
             && (child = child_lookup_online (ov, uuid))) {
             flux_log (ov->h,
                       LOG_ERR,
-                      "%s (rank %d) transitioning to LOST due to %s",
+                      "%s (rank %d) transitioning to %s->LOST"
+                      " after %ds due to %s",
                       flux_get_hostbyrank (ov->h, child->rank),
                       (int)child->rank,
+                      subtree_status_str (child->status),
+                      (int)(monotime_since (child->status_timestamp) / 1000.0),
                       "EHOSTUNREACH error on send");
             overlay_child_status_update (ov, child, SUBTREE_STATUS_LOST);
         }