From 40beae67490cd2f03c597c5f13a6c2a63b574fcb Mon Sep 17 00:00:00 2001
From: Tim Nelson <tn@cs.brown.edu>
Date: Thu, 29 Feb 2024 17:43:49 -0500
Subject: [PATCH] Fix: better handling for multiple-tests (#245)

---
 forge/send-to-kodkod.rkt                      |   5 +
 forge/sigs-structs.rkt                        |  46 ++++--
 forge/sigs.rkt                                | 137 ++++++++++--------
 forge/tests/error/bsl_multiple_failures.frg   |   2 +-
 .../multiple-positive-examples-failing.frg    |   1 +
 forge/tests/error/multiple_test_failures.frg  |   1 +
 ...d_tree_underconstraint_multiple_errors.frg |   1 +
 forge/tests/error/unstated_bounds.frg         |  11 ++
 .../forge-core/other/test_behavior_first.rkt  |  42 ++++++
 .../forge-core/other/test_behavior_last.rkt   |  44 ++++++
 10 files changed, 223 insertions(+), 67 deletions(-)
 create mode 100644 forge/tests/error/unstated_bounds.frg
 create mode 100644 forge/tests/forge-core/other/test_behavior_first.rkt
 create mode 100644 forge/tests/forge-core/other/test_behavior_last.rkt

diff --git a/forge/send-to-kodkod.rkt b/forge/send-to-kodkod.rkt
index 66a6c988..afcd9f63 100644
--- a/forge/send-to-kodkod.rkt
+++ b/forge/send-to-kodkod.rkt
@@ -343,8 +343,13 @@
   
   ; Print solve
   (define (get-next-model [mode ""])
+    ; If the solver isn't running at all, error:
     (unless (is-running?)
       (raise-user-error "KodKod server is not running."))
+    ; If the solver is running, but this specific run ID is closed, user error
+    (when (is-run-closed? run-name)
+      (raise-user-error (format "Run ~a has been closed." run-name)))
+    
     (pardinus-print (pardinus:solve run-name mode))
     (define result (translate-from-kodkod-cli
                     'run 
diff --git a/forge/sigs-structs.rkt b/forge/sigs-structs.rkt
index 9b14dbcd..de68a087 100644
--- a/forge/sigs-structs.rkt
+++ b/forge/sigs-structs.rkt
@@ -169,6 +169,7 @@
   [run_sterling (or/c string? symbol?)]
   [sterling_port nonnegative-integer?]
   [engine_verbosity nonnegative-integer?]
+  [test_keep symbol?]
   ) #:transparent)
 
 (struct/contract State (
@@ -209,7 +210,7 @@
   [name symbol?]
   [command syntax?]
   [run-spec Run-spec?]
-  [result tree:node?] ; TODO: specify
+  [result tree:node?]
   [server-ports Server-ports?]
   [atoms (listof (or/c symbol? number?))]
   [kodkod-currents Kodkod-current?]
@@ -224,7 +225,8 @@
 (define DEFAULT-SIG-SCOPE (Range 0 4))
 ; an engine_verbosity of 1 logs SEVERE level in the Java engine;
 ;   this will send back info about crashes, but shouldn't spam (and possibly overfill) stderr.
-(define DEFAULT-OPTIONS (Options 'surface 'SAT4J 'pardinus 20 0 0 1 5 'default 'close-noretarget 'fast 0 'off 'on 0 1))
+(define DEFAULT-OPTIONS (Options 'surface 'SAT4J 'pardinus 20 0 0 1 5 'default
+                                 'close-noretarget 'fast 0 'off 'on 0 1 'first))
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;    Constants    ;;;;;;;
@@ -255,7 +257,11 @@
         'local_necessity Options-local_necessity
         'run_sterling Options-run_sterling
         'sterling_port Options-sterling_port
-        'engine_verbosity Options-engine_verbosity))
+        'engine_verbosity Options-engine_verbosity
+        'test_keep Options-test_keep))
+
+(define (oneof-pred lst)
+  (lambda (x) (member x lst)))
 
 (define option-types
   (hash 'eval-language symbol?
@@ -268,14 +274,14 @@
         'min_tracelength exact-positive-integer?
         'max_tracelength exact-positive-integer?
         'problem_type symbol?
-        'target_mode (lambda (x)
-                       (member x '(close_noretarget far_noretarget close_retarget far_retarget hamming_cover)))
+        'target_mode (oneof-pred '(close_noretarget far_noretarget close_retarget far_retarget hamming_cover))
         'core_minimization symbol?
         'skolem_depth exact-integer?
         'local_necessity symbol?
         'run_sterling (lambda (x) (or (symbol? x) (string? x))) ; allow for custom visualizer path
         'sterling_port exact-nonnegative-integer?
-        'engine_verbosity exact-nonnegative-integer?))
+        'engine_verbosity exact-nonnegative-integer?
+        'test_keep (oneof-pred '(first last))))
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;  Initial State  ;;;;;;;
@@ -564,24 +570,46 @@ Returns whether the given run resulted in sat or unsat, respectively.
   (assert-is-running run)
   (Server-ports-stderr (Run-server-ports run)))
 
+;;;;;;;;;;;;;;;;;;;;;;;
+; Per-run closed status
+
+; Keep track of which runs have been closed via close-run
+(define closed-run-names (box (list)))
+; Allows other modules to let this layer know a run is closed; this box
+; is referenced by the instance generator for each run.
+(define (add-closed-run-name! name)
+  (set-box! closed-run-names (cons name (unbox closed-run-names))))
+(define (is-run-closed? name-or-run)
+  (define (truthify x) (if x #t #f))
+  (truthify
+   (cond [(Run? name-or-run) 
+          (member (Run-name name-or-run) (unbox closed-run-names))]
+         [else
+          (member name-or-run (unbox closed-run-names))])))
+
 ; close-run :: Run -> void
 (define (close-run run)
   (assert-is-running run)
   (when (>= (get-verbosity) VERBOSITY_HIGH)
-        (printf "Clearing run ~a. Keeping engine process active...~n" (Run-name run)))  
+        (printf "Clearing run ~a. Keeping engine process active...~n" (Run-name run)))
+  ; Cut off this Run's ability to query the solver, since it's about to be closed
+  ; This state is referenced in the instance-generator thunk
+  (add-closed-run-name! (Run-name run))
+  
   ; Since we're using a single process now, send it instructions to clear this run
   (pardinus:cmd 
       [(get-stdin run)]
-      
       (pardinus:clear (Run-name run))))
 
 ; is-running :: Run -> Boolean
+; This reports whether the _solver server_ is running;
+; *NOT* whether an individual run is still open.
 (define (is-running? run)
   ((Server-ports-is-running? (Run-server-ports run))))
 
 (define (assert-is-running run)
   (unless (is-running? run)
-    (raise-user-error "KodKod server is not running.")))
+    (raise-user-error "KodKod/Pardinus solver process is not running.")))
 
 (require (for-syntax syntax/srcloc)) ; for these macros
 
diff --git a/forge/sigs.rkt b/forge/sigs.rkt
index 045c4c0d..f7d73312 100644
--- a/forge/sigs.rkt
+++ b/forge/sigs.rkt
@@ -1,7 +1,7 @@
 #lang racket/base
 
 (require (only-in racket/function thunk)
-         (only-in racket/list first rest empty empty? flatten remove-duplicates)
+         (only-in racket/list first rest empty empty? flatten remove-duplicates last)
          (only-in racket/pretty pretty-print)
          (prefix-in @ (only-in racket/base display max min - +)) 
          (prefix-in @ racket/set)
@@ -188,6 +188,8 @@
 (define (state-set-option state option value #:original-path [original-path #f])
   (define options (State-options state))
 
+  (unless (hash-ref option-types option #f)
+    (raise-user-error (format "No such option: ~a" option)))
   (unless ((hash-ref option-types option) value)
     (raise-user-error (format "Setting option ~a requires ~a; received ~a"
                               option (hash-ref option-types option) value)))
@@ -258,7 +260,10 @@
                     [sterling_port value])]
       [(equal? option 'engine_verbosity)
        (struct-copy Options options
-                    [engine_verbosity value])]))
+                    [engine_verbosity value])]
+      [(equal? option 'test_keep)
+       (struct-copy Options options
+                    [test_keep value])]))
 
   (struct-copy State state
                [options new-options]))
@@ -586,40 +591,32 @@
           [(member 'expected '(sat unsat))           
            #,(syntax/loc stx (run name args ...))
            (define first-instance (tree:get-value (Run-result name)))
-           (unless (equal? (if (Sat? first-instance) 'sat 'unsat) 'expected)
-             ;(when (> (get-verbosity) 0)
-             ;  (printf "Unexpected result found, with statistics and metadata:~n")
-             ;  (pretty-print first-instance))
-             ;(display name) ;; Display in Sterling since the test failed.
-             (report-test-failure
-              #:name 'name
-              #:msg (format "Failed test ~a. Expected ~a, got ~a.~a"
-                            'name 'expected (if (Sat? first-instance) 'sat 'unsat)
-                            (if (Sat? first-instance)
-                                (format " Found instance ~a" first-instance)
-                                (if (Unsat-core first-instance)
-                                    (format " Core: ~a" (Unsat-core first-instance))
-                                    "")))
-              #:context loc
-              #:instance first-instance
-              #:run name))
-           (close-run name)]
+           (if (not (equal? (if (Sat? first-instance) 'sat 'unsat) 'expected))
+               (report-test-failure
+                #:name 'name
+                #:msg (format "Failed test ~a. Expected ~a, got ~a.~a"
+                              'name 'expected (if (Sat? first-instance) 'sat 'unsat)
+                              (if (Sat? first-instance)
+                                  (format " Found instance ~a" first-instance)
+                                  (if (Unsat-core first-instance)
+                                      (format " Core: ~a" (Unsat-core first-instance))
+                                      "")))
+                #:context loc
+                #:instance first-instance
+                #:run name)
+               (close-run name))]
 
           [(equal? 'expected 'theorem)          
            #,(syntax/loc stx (check name args ...))
            (define first-instance (tree:get-value (Run-result name)))
-           (when (Sat? first-instance)
-             ;(when (> (get-verbosity) 0)
-             ;  (printf "Instance found, with statistics and metadata:~n")
-             ;  (pretty-print first-instance))
-             ;(display name) ;; Display in sterling since the test failed.
-             (report-test-failure #:name 'name
-                                  #:msg (format "Theorem ~a failed. Found instance:~n~a"
-                                                'name first-instance)
-                                  #:context loc
-                                  #:instance first-instance
-                                  #:run name))
-           (close-run name)]
+           (if (Sat? first-instance)
+               (report-test-failure #:name 'name
+                                    #:msg (format "Theorem ~a failed. Found instance:~n~a"
+                                                  'name first-instance)
+                                    #:context loc
+                                    #:instance first-instance
+                                    #:run name)
+               (close-run name))]
 
           [else (raise-forge-error                 
                  #:msg (format "Illegal argument to test. Received ~a, expected sat, unsat, or theorem."
@@ -638,21 +635,27 @@
             #:context #,(build-source-location stx)))
          #,(syntax/loc stx (run name #:preds [pred] #:bounds [bounds ...]))
          (define first-instance (tree:get-value (Run-result name)))
-         (when (Unsat? first-instance)
-           #,(syntax/loc stx (run double-check-name #:preds [] #:bounds [bounds ...]))
-           (define double-check-instance (tree:get-value (Run-result double-check-name)))
-           (if (Sat? double-check-instance)
-               (report-test-failure #:name 'name #:msg (format "Invalid example '~a'; the instance specified does not satisfy the given predicate." 'name)
-                                  #:context #,(build-source-location stx)
-                                  #:instance first-instance
-                                  #:run name)
-               (report-test-failure #:name 'name #:msg (format (string-append "Invalid example '~a'; the instance specified is impossible. "
+         (cond
+           [(Unsat? first-instance)
+            ; Run a second check to see if {} would have also failed, meaning this example
+            ; violates the sig/field declarations.
+            #,(syntax/loc stx (run double-check-name #:preds [] #:bounds [bounds ...]))
+            (define double-check-instance (tree:get-value (Run-result double-check-name)))
+            (close-run double-check-name) ;; always close the double-check run immediately
+            
+            (if (Sat? double-check-instance)
+                (report-test-failure #:name 'name #:msg (format "Invalid example '~a'; the instance specified does not satisfy the given predicate." 'name)
+                                     #:context #,(build-source-location stx)
+                                     #:instance first-instance
+                                     #:run name)
+                (report-test-failure #:name 'name #:msg (format (string-append "Invalid example '~a'; the instance specified is impossible. "
                                                                "This means that the specified bounds conflict with each other "
-                                                               "or with the sig/relation definitions.")
-                                                'name)
-                                  #:context #,(build-source-location stx)
-                                  #:instance first-instance
-                                  #:run name)))))))]))
+                                                               "or with the sig/field definitions.")
+                                                                'name)
+                                     #:context #,(build-source-location stx)
+                                     #:instance first-instance
+                                     #:run name))]
+           [else (close-run name)])))))]))
 
 ; Checks that some predicates are always true.
 ; (check name
@@ -702,7 +705,7 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ; make-model-generator :: Stream<model> -> (-> model)
-; Creates a thunk which generates a new model on each call.
+; Creates a thunk which generates a new instance on each call.
 (define (make-model-generator model-lazy-tree [mode 'next])
   (thunk
     (define ret (tree:get-value model-lazy-tree))
@@ -981,28 +984,44 @@
 ;; Infrastructure for handling multiple test failures / multiple runs
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+; Struct to hold test-failure information for eventual reporting
 (struct test-failure (name msg context instance run))
-
-(define delayed-test-failures null)                                                      
-
+; Mutable value to store list of test-failure structs
+(define delayed-test-failures null)
+; Called to clear the mutable list
 (define (reset-test-failures!) (set! delayed-test-failures null))
-(define delay-test-failure-reporting? (make-parameter #t))
 
 ; Record (or report, depending on the value of the delay-test-failure-reporting?
 ; parameter) a test failure. 
 (define (report-test-failure #:name name #:msg msg #:context context #:instance [instance #f] #:run run)
-  (cond [(delay-test-failure-reporting?)
-         (printf "Test ~a failed. Continuing to run, and will report details at the end.~n" name)
+  ; Default is to not delay, but options may affect this.
+  (cond [(not (equal? (get-option run 'test_keep) 'first))
+         (unless (equal? (get-verbosity) 0)
+           (printf "Test ~a failed. Continuing to run and will report details at the end.~n" name))
+         ; close previous failure run, since we are keeping only the final failure for Sterling
+         (unless (empty? delayed-test-failures)
+           (close-run (test-failure-run (first delayed-test-failures))))
+         ; then add this failure to the queue
          (set! delayed-test-failures (cons (test-failure name msg context instance run)
                                            delayed-test-failures))]
+        
         [else
+         ; Raise a Forge error and stop execution; show Sterling if enabled.
+         (when (>= (get-verbosity) 1)
+           (printf "Test ~a failed. Opening Sterling (if able) and stopping.~n" name))
+         (true-display run)
          (raise-forge-error #:msg msg #:context context)]))
 
+; To be run at the very end of the Forge execution; reports test failures and opens
+; only one failure (as only one solver will be kept open, at the moment)
 (define (output-all-test-failures)
   ; In order, for each failure:  
   (define failures (remove-duplicates (reverse delayed-test-failures)))
-  (unless (empty? failures)
-    (printf "~nSome tests failed. Reporting failures in order.~n~n" ))
+  (unless (or (< (get-verbosity) 1) (empty? failures))
+    (printf "~nSome tests failed. Reporting failures in order:~n~n" ))
+
+  (define last-failure (if (empty? failures) #f (last failures)))
+  
   (for ([failure failures])
     (define-values (name msg context instance run)
       (values (test-failure-name failure)    (test-failure-msg failure)
@@ -1012,12 +1031,16 @@
     ; Print the error (don't raise an actual exception)
     (define sterling-or-instance (if (equal? (get-option run 'run_sterling) 'off)
                                      (format "Sterling disabled, so reporting raw instance data:~n~a" instance)
-                                     "Running Sterling to show instance (or unsat) generated..."))
+                                     (format "Running Sterling to show instance generated, if any.~n~a"
+                                             (if (equal? failure last-failure)
+                                                 "Solver is active; evaluator and next are available."
+                                                 "For all but the final test failure, the solver was closed to save memory; evaluator and next are unavailable."))))
     (raise-forge-error #:msg (format "~a ~a~n~n" msg sterling-or-instance)
                        #:context context
                        #:raise? #f)
     ; Display in Sterling (if run_sterling is enabled)
-    (true-display run))
+    (when (equal? failure last-failure)
+      (true-display run)))
   
   ; Return to empty-failure-list state.
   (reset-test-failures!))
diff --git a/forge/tests/error/bsl_multiple_failures.frg b/forge/tests/error/bsl_multiple_failures.frg
index b7f214ac..cf2e5059 100644
--- a/forge/tests/error/bsl_multiple_failures.frg
+++ b/forge/tests/error/bsl_multiple_failures.frg
@@ -1,6 +1,6 @@
 #lang forge/bsl
 option run_sterling off
-
+option test_keep last
 option verbose 0
 
 sig Node {
diff --git a/forge/tests/error/multiple-positive-examples-failing.frg b/forge/tests/error/multiple-positive-examples-failing.frg
index a5abb2a8..bf327ad2 100644
--- a/forge/tests/error/multiple-positive-examples-failing.frg
+++ b/forge/tests/error/multiple-positive-examples-failing.frg
@@ -1,5 +1,6 @@
 #lang forge
 option run_sterling off
+option test_keep last
 option verbose 0 
 
 -- Regression test to confirm that the "double-check" done in example tests 
diff --git a/forge/tests/error/multiple_test_failures.frg b/forge/tests/error/multiple_test_failures.frg
index 62dfc52f..b4fe517c 100644
--- a/forge/tests/error/multiple_test_failures.frg
+++ b/forge/tests/error/multiple_test_failures.frg
@@ -1,6 +1,7 @@
 #lang forge
 
 option run_sterling off
+option test_keep last
 option verbose 0
 
 
diff --git a/forge/tests/error/properties_undirected_tree_underconstraint_multiple_errors.frg b/forge/tests/error/properties_undirected_tree_underconstraint_multiple_errors.frg
index 3dfc1cf0..240dddaf 100644
--- a/forge/tests/error/properties_undirected_tree_underconstraint_multiple_errors.frg
+++ b/forge/tests/error/properties_undirected_tree_underconstraint_multiple_errors.frg
@@ -1,5 +1,6 @@
 #lang forge
 option run_sterling off
+option test_keep last
 option verbose 0
 
 sig Node {edges: set Node}
diff --git a/forge/tests/error/unstated_bounds.frg b/forge/tests/error/unstated_bounds.frg
new file mode 100644
index 00000000..1356b6dd
--- /dev/null
+++ b/forge/tests/error/unstated_bounds.frg
@@ -0,0 +1,11 @@
+#lang forge 
+
+// Detect when sufficient bounds have not been provided. 
+
+sig Match {}
+sig B, C extends Match {} 
+
+test expect {
+    should_error: {} for exactly 4 B, 3 C is sat
+}
+
diff --git a/forge/tests/forge-core/other/test_behavior_first.rkt b/forge/tests/forge-core/other/test_behavior_first.rkt
new file mode 100644
index 00000000..bd68ba92
--- /dev/null
+++ b/forge/tests/forge-core/other/test_behavior_first.rkt
@@ -0,0 +1,42 @@
+#lang forge/core
+
+; Confirm that test runs are closed or kept open as expected. 
+; (This is related to multiple_runs.rkt, but has a different focus.)
+
+(require (only-in rackunit check-eq? check-not-eq? check-exn check-not-exn check-true))
+(set-option! 'verbose 0)
+(set-option! 'run_sterling 'off)
+
+; 'first is the default, so no need to run this line:
+;(set-option! 'test_keep 'first) 
+
+(sig A)
+(sig B)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Default option: 'test_keep is 'first
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; This test should fail and produce an exception immediately in 'first mode: 
+(check-exn #rx"Failed test sat-run-A."
+           (lambda () (test sat-run-A
+                            #:preds [(some A) (no B)]
+                            #:expect unsat)))
+
+; This is not bound to an accessible identifier, but it _is_ kept in the state:
+(define local-sat-run-A (hash-ref (forge:State-runmap forge:curr-state) 'sat-run-A))
+; The test run should remain open, so that Sterling's evaluator is available, etc.
+; DO NOT USE is-running?; that is for the solver process as a whole.
+(check-true (forge:is-sat? local-sat-run-A))
+(check-true (not (forge:is-run-closed? local-sat-run-A)))
+
+;;;;;;;;;;;;;;;;;;;;;
+
+; This test should pass, and its run should be closed immediately.
+(check-not-exn 
+ (lambda () (test sat-run-B
+                  #:preds [(some A) (no B)]
+                  #:expect sat)))
+(define local-sat-run-B (hash-ref (forge:State-runmap forge:curr-state) 'sat-run-B))
+(check-true (forge:is-sat? local-sat-run-B))
+(check-true (forge:is-run-closed? local-sat-run-B))
diff --git a/forge/tests/forge-core/other/test_behavior_last.rkt b/forge/tests/forge-core/other/test_behavior_last.rkt
new file mode 100644
index 00000000..ee593c08
--- /dev/null
+++ b/forge/tests/forge-core/other/test_behavior_last.rkt
@@ -0,0 +1,44 @@
+#lang forge/core
+
+; Confirm that test runs are closed or kept open as expected. 
+; (This is related to multiple_runs.rkt, but has a different focus.)
+
+(require (only-in rackunit check-eq? check-not-eq? check-exn check-not-exn check-true))
+(set-option! 'verbose 0)
+(set-option! 'run_sterling 'off)
+
+; Not the default:
+(set-option! 'test_keep 'last) 
+
+(sig A)
+(sig B)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Modified option: 'test_keep is 'last
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; This test should fail, but not immediately produce an exception:
+(check-not-exn
+ (lambda () (test sat-run-A
+                  #:preds [(some A) (no B)]
+                  #:expect unsat)))
+; Same with this test. 
+(check-not-exn
+ (lambda () (test sat-run-B
+                  #:preds [(some A) (no B)]
+                  #:expect unsat)))
+
+; This is not bound to an accessible identifier, but it _is_ kept in the state:
+(define local-sat-run-A (hash-ref (forge:State-runmap forge:curr-state) 'sat-run-A))
+(define local-sat-run-B (hash-ref (forge:State-runmap forge:curr-state) 'sat-run-B))
+; Under the 'last option, the first run should have been closed
+; DO NOT USE is-running?; that is for the solver process as a whole.
+; Under the 'last option, the second run should remain open (since it is the last failure)
+(check-true (forge:is-sat? local-sat-run-A))
+(check-true (forge:is-run-closed? local-sat-run-A)) 
+(check-true (forge:is-sat? local-sat-run-B))
+(check-true (not (forge:is-run-closed? local-sat-run-B)))
+
+; This is, however, not calling the failure-reporter that the Forge surface languages
+; invoke at the end of their execution -- since this is forge/core.
+; (output-all-test-failures)