From 9c78e9075e41330f4d25b93f8d2eff5bdd11e3e3 Mon Sep 17 00:00:00 2001 From: Peter Karalekas Date: Sat, 18 Nov 2023 09:57:35 -0800 Subject: [PATCH 1/5] Pass the debug? flag when sowing nodes --- src/dryad.lisp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/dryad.lisp b/src/dryad.lisp index ddeb576..fd7f85e 100644 --- a/src/dryad.lisp +++ b/src/dryad.lisp @@ -56,7 +56,8 @@ NOTE: In the basic implementation, these messages must be waiting for the DRYAD (let* ((node-id (message-sow-id message)) (node-process (spawn-process (dryad-node-class dryad) :dryad (process-public-address dryad) - :id node-id)) + :id node-id + :debug? (process-debug? dryad))) (node-address (process-public-address node-process))) (schedule node-process now) (setf (gethash node-address (dryad-ids dryad)) node-id From 2ca0b89bc9b1e36acbfbd924d7bab6ad39bcca70 Mon Sep 17 00:00:00 2001 From: Peter Karalekas Date: Sat, 18 Nov 2023 09:58:40 -0800 Subject: [PATCH 2/5] Pull one-root cluster aborting check into the root collection step --- src/operations/multireweight.lisp | 83 ++++++++++++++++--------------- 1 file changed, 44 insertions(+), 39 deletions(-) diff --git a/src/operations/multireweight.lisp b/src/operations/multireweight.lisp index 83469ee..6cca56d 100644 --- a/src/operations/multireweight.lisp +++ b/src/operations/multireweight.lisp @@ -51,55 +51,50 @@ ;;; (define-process-upkeep ((supervisor supervisor) now) (START-MULTIREWEIGHT pong) - "Sets up the multireweight procedure. - -1. Collect the mutually held roots for the `HOLD-CLUSTER' -2. Lock the `HOLD-CLUSTER' and check the rootiness of each root. -3. Change the pingability of the cluster to `:SOFT'. -4. Scan the `HOLD-CLUSTER' for the best external rec to use for reweighting. -5. Reweight the `HOLD-CLUSTER' according to the recommendation. -6. Check to see if the `HOLD-CLUSTER' should be rewound, and do so if need be. -7. Unlock the targets and tear down transient state." + "Sets up the multireweight procedure by first collecting mutually-held roots, which form the `HOLD-CLUSTER'." ;; NOTE: we couldn't call MAKE-PONG even if we wanted to, since we don't have ;; access to the underlying node's Lisp object (or its type). (push (make-data-frame-multireweight :internal-pong nil) (process-data-stack supervisor)) (with-slots (root-bucket source-root) pong (setf root-bucket (remove-duplicates root-bucket :test #'address=)) - (process-continuation supervisor - `(CONVERGECAST-COLLECT-ROOTS ,(list source-root) ,root-bucket) - `(CHECK-PRIORITY ,source-root) - `(START-INNER-MULTIREWEIGHT) - `(FINISH-MULTIREWEIGHT) - `(HALT)))) + (process-continuation supervisor `(CONVERGECAST-COLLECT-ROOTS ,source-root + ,root-bucket)))) (define-process-upkeep ((supervisor supervisor) now) - (CONVERGECAST-COLLECT-ROOTS cluster roots) - "Recursively collects the `HELD-BY-ROOTS' values of `ROOTS' to determine the set of roots that are participating in this `HOLD' cluster (meaning that they are mutually held by each other), starting with a base `CLUSTER' of just the `SOURCE-ROOT'. If any replies are NIL, we abort." - (with-slots (hold-cluster) (peek (process-data-stack supervisor)) - (flet ((payload-constructor () - (make-message-convergecast-collect-roots :hold-cluster cluster))) - (with-replies (replies :returned? returned?) - (send-message-batch #'payload-constructor roots) - (when (some #'null replies) - (log-entry :entry-type 'aborting-multireweight - :reason 'root-collection-failed - :hold-cluster cluster - :held-by-roots roots) - (setf (process-lockable-aborting? supervisor) t) - (finish-with-scheduling)) - (setf hold-cluster (reduce #'address-union (list* cluster replies))))))) + (CONVERGECAST-COLLECT-ROOTS source-root root-bucket) + "Recursively collects the `HELD-BY-ROOTS' values of `ROOT-BUCKET' to determine the set of roots that are participating in this `HOLD' cluster (meaning that they are mutually held by each other), starting with a base `CLUSTER' of just the `SOURCE-ROOT'. If any replies are NIL, we abort." + (let ((cluster (list source-root))) + (with-slots (hold-cluster) (peek (process-data-stack supervisor)) + (flet ((payload-constructor () + (make-message-convergecast-collect-roots :hold-cluster cluster))) + (with-replies (replies :returned? returned?) + (send-message-batch #'payload-constructor root-bucket) + (when (some #'null replies) + (log-entry :entry-type 'aborting-multireweight + :reason 'root-collection-failed + :hold-cluster cluster + :held-by-roots root-bucket) + (setf (process-lockable-aborting? supervisor) t) + (finish-with-scheduling)) + (setf hold-cluster (reduce #'address-union (list* cluster replies))) + ;; don't bother _multi_reweighting if we're in a cluster of 1. + (when (endp (rest hold-cluster)) + (log-entry :entry-type 'aborting-multireweight + :reason 'cluster-of-one + :hold-cluster hold-cluster) + (setf (process-lockable-aborting? supervisor) t) + (finish-with-futures)) + ;; otherwise, push the next set of commands onto the stack + (process-continuation supervisor + `(CHECK-PRIORITY ,source-root) ; TODO: add 2nd arg + `(START-INNER-MULTIREWEIGHT) + `(FINISH-MULTIREWEIGHT) + `(HALT))))))) (define-process-upkeep ((supervisor supervisor) now) (CHECK-PRIORITY original-root) "Confirm that, of the roots in the hold cluster, we have priority to act. Namely, we have priority when our `ORIGINAL-ROOT' carries the minimum ID of all the roots in the cluster." (with-slots (hold-cluster) (peek (process-data-stack supervisor)) - ;; don't bother _multi_reweighting if we're in a cluster of 1. - (when (endp (rest hold-cluster)) - (log-entry :entry-type 'aborting-multireweight - :reason 'cluster-of-one - :hold-cluster hold-cluster) - (setf (process-lockable-aborting? supervisor) t) - (finish-with-futures)) (sync-rpc (make-message-id-query) (original-id original-root) (with-replies (replies) @@ -109,7 +104,17 @@ (setf (process-lockable-aborting? supervisor) t))))))) (define-process-upkeep ((supervisor supervisor) now) (START-INNER-MULTIREWEIGHT) - "This is the start of the \"critical segment\", where it begins to be impossible to rewind partway through the modifications we're about to make." + "This is the start of the \"critical segment\", where it begins to be impossible to rewind partway through the modifications we're about to make. + +1. Lock the `HOLD-CLUSTER'. +2. Check that each root in the `HOLD-CLUSTER' is still a root. +3. Change the pingability of the cluster to `:SOFT'. +4. Scan the `HOLD-CLUSTER' for the best external rec to use for reweighting. +5. Change the pingability of the cluster to `:NONE'. +6. Reweight the `HOLD-CLUSTER' according to the recommendation. +7. Change the pingability of the cluster to `:SOFT'. +8. Check to see if the `HOLD-CLUSTER' should be rewound, and do so if need be. +9. Unlock the targets and tear down transient state." (with-slots (hold-cluster) (peek (process-data-stack supervisor)) (cond ((not (process-lockable-aborting? supervisor)) @@ -122,7 +127,7 @@ `(MULTIREWEIGHT-BROADCAST-REWEIGHT ,hold-cluster) `(BROADCAST-PINGABILITY ,hold-cluster :SOFT) `(MULTIREWEIGHT-CHECK-REWINDING ,hold-cluster) - `(BROADCAST-UNLOCK))) ; don't destroy trees + `(BROADCAST-UNLOCK))) ; don't destroy trees (t (log-entry :entry-type 'aborting-multireweight :reason 'previously-aborted From 10a77a8a298584f0adc2a6e40e65f1c0819465d1 Mon Sep 17 00:00:00 2001 From: Peter Karalekas Date: Sat, 18 Nov 2023 10:11:34 -0800 Subject: [PATCH 3/5] Pass target-roots to CHECK-PRIORITY --- src/operations/multireweight.lisp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/operations/multireweight.lisp b/src/operations/multireweight.lisp index 6cca56d..6c71a07 100644 --- a/src/operations/multireweight.lisp +++ b/src/operations/multireweight.lisp @@ -63,7 +63,7 @@ (define-process-upkeep ((supervisor supervisor) now) (CONVERGECAST-COLLECT-ROOTS source-root root-bucket) - "Recursively collects the `HELD-BY-ROOTS' values of `ROOT-BUCKET' to determine the set of roots that are participating in this `HOLD' cluster (meaning that they are mutually held by each other), starting with a base `CLUSTER' of just the `SOURCE-ROOT'. If any replies are NIL, we abort." + "Recursively collects the `HELD-BY-ROOTS' values of `ROOT-BUCKET' to determine the set of roots that are participating in this `HOLD' cluster (meaning that they are mutually held by each other), starting with a base `cluster' of just the `SOURCE-ROOT'. If any replies are NIL, we abort." (let ((cluster (list source-root))) (with-slots (hold-cluster) (peek (process-data-stack supervisor)) (flet ((payload-constructor () @@ -87,20 +87,20 @@ (finish-with-futures)) ;; otherwise, push the next set of commands onto the stack (process-continuation supervisor - `(CHECK-PRIORITY ,source-root) ; TODO: add 2nd arg + `(CHECK-PRIORITY ,source-root ,hold-cluster) `(START-INNER-MULTIREWEIGHT) `(FINISH-MULTIREWEIGHT) `(HALT))))))) -(define-process-upkeep ((supervisor supervisor) now) (CHECK-PRIORITY original-root) - "Confirm that, of the roots in the hold cluster, we have priority to act. Namely, we have priority when our `ORIGINAL-ROOT' carries the minimum ID of all the roots in the cluster." - (with-slots (hold-cluster) (peek (process-data-stack supervisor)) - (sync-rpc (make-message-id-query) - (original-id original-root) +(define-process-upkeep ((supervisor supervisor) now) + (CHECK-PRIORITY source-root target-roots) + "Confirm that, of the roots in the hold cluster, we have priority to act. Namely, we have priority when our `SOURCE-ROOT' carries the minimum ID (i.e. coordinate) of all the roots in the `hold-cluster' (passed as `TARGET-ROOTS')." + (let ((hold-cluster target-roots)) + (sync-rpc (make-message-id-query) (source-id source-root) (with-replies (replies) (send-message-batch #'make-message-id-query hold-cluster) (let ((cluster-id (reduce #'min-id replies))) - (unless (equalp original-id (min-id original-id cluster-id)) + (unless (equalp source-id (min-id source-id cluster-id)) (setf (process-lockable-aborting? supervisor) t))))))) (define-process-upkeep ((supervisor supervisor) now) (START-INNER-MULTIREWEIGHT) From 2c7b7c2a731f8e5afe57d638283d779d29e1f1d7 Mon Sep 17 00:00:00 2001 From: Peter Karalekas Date: Sat, 18 Nov 2023 10:35:03 -0800 Subject: [PATCH 4/5] Don't forget to HALT... --- src/operations/multireweight.lisp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/operations/multireweight.lisp b/src/operations/multireweight.lisp index 6c71a07..0a717fd 100644 --- a/src/operations/multireweight.lisp +++ b/src/operations/multireweight.lisp @@ -58,8 +58,9 @@ (process-data-stack supervisor)) (with-slots (root-bucket source-root) pong (setf root-bucket (remove-duplicates root-bucket :test #'address=)) - (process-continuation supervisor `(CONVERGECAST-COLLECT-ROOTS ,source-root - ,root-bucket)))) + (process-continuation supervisor + `(CONVERGECAST-COLLECT-ROOTS ,source-root ,root-bucket) + `(HALT)))) (define-process-upkeep ((supervisor supervisor) now) (CONVERGECAST-COLLECT-ROOTS source-root root-bucket) @@ -89,8 +90,7 @@ (process-continuation supervisor `(CHECK-PRIORITY ,source-root ,hold-cluster) `(START-INNER-MULTIREWEIGHT) - `(FINISH-MULTIREWEIGHT) - `(HALT))))))) + `(FINISH-MULTIREWEIGHT))))))) (define-process-upkeep ((supervisor supervisor) now) (CHECK-PRIORITY source-root target-roots) From 439ccb45979bed7624beb83af11fee41f1421e6f Mon Sep 17 00:00:00 2001 From: Peter Karalekas Date: Sat, 18 Nov 2023 10:37:40 -0800 Subject: [PATCH 5/5] Or to pop off the data frame... --- src/operations/multireweight.lisp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operations/multireweight.lisp b/src/operations/multireweight.lisp index 0a717fd..816232e 100644 --- a/src/operations/multireweight.lisp +++ b/src/operations/multireweight.lisp @@ -60,6 +60,7 @@ (setf root-bucket (remove-duplicates root-bucket :test #'address=)) (process-continuation supervisor `(CONVERGECAST-COLLECT-ROOTS ,source-root ,root-bucket) + `(FINISH-MULTIREWEIGHT) `(HALT)))) (define-process-upkeep ((supervisor supervisor) now) @@ -89,8 +90,7 @@ ;; otherwise, push the next set of commands onto the stack (process-continuation supervisor `(CHECK-PRIORITY ,source-root ,hold-cluster) - `(START-INNER-MULTIREWEIGHT) - `(FINISH-MULTIREWEIGHT))))))) + `(START-INNER-MULTIREWEIGHT))))))) (define-process-upkeep ((supervisor supervisor) now) (CHECK-PRIORITY source-root target-roots)