diff --git a/pySDC/projects/Resilience/fault_stats.py b/pySDC/projects/Resilience/fault_stats.py
index 6bb22c9827..b75e47ded8 100644
--- a/pySDC/projects/Resilience/fault_stats.py
+++ b/pySDC/projects/Resilience/fault_stats.py
@@ -271,8 +271,10 @@ def generate_stats(self, strategy=None, runs=1000, reload=True, faults=True, com
                 dat['bit'][i] = faults_run[0][1][4]
                 dat['target'][i] = faults_run[0][1][5]
                 dat['rank'][i] = faults_run[0][1][6]
+
             if crash:
                 print('Code crashed!')
+                dat['error'][i] = np.inf
                 continue

             # record the rest of the data
@@ -876,7 +878,7 @@ def plot_things_per_things(
         args=None,
         strategies=None,
         name=None,
-        store=True,
+        store=False,
         ax=None,
         fig=None,
         plotting_args=None,
@@ -933,7 +935,7 @@ def plot_things_per_things(
         return None

     def plot_recovery_thresholds(
-        self, strategies=None, thresh_range=None, ax=None, mask=None, **kwargs
+        self, strategies=None, thresh_range=None, ax=None, recoverable_only=False, **kwargs
     ):  # pragma: no cover
         '''
         Plot the recovery rate for a range of thresholds
@@ -942,7 +944,6 @@ def plot_recovery_thresholds(
            strategies (list): List of the strategies you want to plot, if None, all will be plotted
            thresh_range (list): thresholds for deciding whether to accept as recovered
            ax (Matplotlib.axes): Somewhere to plot
-           mask (Numpy.ndarray of shape (n)): The mask you want to know about

        Returns:
            None
@@ -961,16 +962,21 @@ def plot_recovery_thresholds(
            fault_free = self.load(strategy=strategy, faults=False)
            with_faults = self.load(strategy=strategy, faults=True)

+            if recoverable_only:
+                recoverable_mask = self.get_fixable_faults_only(strategy)
+            else:
+                recoverable_mask = self.get_mask()
+
            for thresh_idx in range(len(thresh_range)):
                rec_mask = self.get_mask(
                    strategy=strategy,
                    key='error',
                    val=(thresh_range[thresh_idx] * fault_free['error'].mean()),
                    op='gt',
-                    old_mask=mask,
+                    old_mask=recoverable_mask,
                )
                rec_rates[strategy_idx][thresh_idx] = 1.0 - len(with_faults['error'][rec_mask]) / len(
-                    with_faults['error']
+                    with_faults['error'][recoverable_mask]
                )

            ax.plot(
@@ -1689,7 +1695,7 @@ def main():
        stats_path='data/stats-jusuf',
        **kwargs,
    )
-    stats_analyser.run_stats_generation(runs=kwargs['runs'], step=12)
+    stats_analyser.run_stats_generation(runs=kwargs['runs'], step=25)

    if MPI.COMM_WORLD.rank > 0:  # make sure only one rank accesses the data
        return None
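Note on the normalization change in plot_recovery_thresholds: the recovery rate is now computed over the runs selected by recoverable_mask rather than over all runs with faults. A minimal standalone sketch of that computation, using synthetic error data in place of the stored run records (all names below are illustrative, not part of the pySDC API):

import numpy as np

rng = np.random.default_rng(1)
fault_free_error = 1e-7 * (1.0 + 0.1 * rng.random(100))  # stand-in for fault_free['error']
with_faults_error = np.concatenate([2e-8 * rng.random(80), 1e-3 * rng.random(20)])  # stand-in for with_faults['error']
recoverable = np.ones_like(with_faults_error, dtype=bool)  # stand-in for recoverable_mask

thresh_range = np.linspace(0.5, 1.5, 1000)
rec_rates = np.empty_like(thresh_range)
for i, thresh in enumerate(thresh_range):
    # a run counts as "not recovered" if its error exceeds thresh times the mean fault-free error
    not_recovered = (with_faults_error > thresh * fault_free_error.mean()) & recoverable
    rec_rates[i] = 1.0 - not_recovered.sum() / recoverable.sum()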
diff --git a/pySDC/projects/Resilience/paper_plots.py b/pySDC/projects/Resilience/paper_plots.py
index 04bea2d954..67f9aca486 100644
--- a/pySDC/projects/Resilience/paper_plots.py
+++ b/pySDC/projects/Resilience/paper_plots.py
@@ -20,6 +20,7 @@
     DIRKStrategy,
     ERKStrategy,
     AdaptivityPolynomialError,
+    cmap,
 )
 from pySDC.helpers.plot_helper import setup_mpl, figsize_by_journal
 from pySDC.helpers.stats_helper import get_sorted
@@ -392,6 +393,102 @@ def plot_fault_vdp(bit=0):  # pragma: no cover
     savefig(fig, f'fault_bit_{bit}')


+def plot_fault_Lorenz(bit=0):  # pragma: no cover
+    """
+    Make a plot showing the impact of a fault on the Lorenz attractor without any resilience.
+    The fault is inserted in the last iteration, in the last node, in the x component, where its impact is most clearly visible.
+
+    Args:
+        bit (int): The bit that you want to flip
+
+    Returns:
+        None
+    """
+    from pySDC.projects.Resilience.fault_stats import (
+        FaultStats,
+        BaseStrategy,
+    )
+    from pySDC.projects.Resilience.hook import LogData
+
+    stats_analyser = FaultStats(
+        prob=run_Lorenz,
+        strategies=[BaseStrategy()],
+        faults=[False, True],
+        reload=True,
+        recovery_thresh=1.1,
+        num_procs=1,
+        mode='combination',
+    )
+
+    strategy = BaseStrategy()
+
+    my_setup_mpl()
+    fig, ax = plt.subplots(figsize=figsize_by_journal(JOURNAL, 0.8, 0.5))
+    colors = ['grey', strategy.color, 'magenta']
+    ls = ['--', '-']
+    markers = [None, strategy.marker]
+    do_faults = [False, True]
+    superscripts = ['*', '']
+    labels = ['x', 'x']
+
+    run = 19 + 20 * bit
+
+    for i in range(len(do_faults)):
+        stats, controller, Tend = stats_analyser.single_run(
+            strategy=BaseStrategy(),
+            run=run,
+            faults=do_faults[i],
+            hook_class=[LogData],
+        )
+        u = get_sorted(stats, type='u')
+        faults = get_sorted(stats, type='bitflip')
+        ax.plot(
+            [me[0] for me in u],
+            [me[1][0] for me in u],
+            ls=ls[i],
+            color=colors[i],
+            label=rf'${{{labels[i]}}}^{{{superscripts[i]}}}$',
+            marker=markers[i],
+            markevery=500,
+        )
+        for idx in range(len(faults)):
+            ax.axvline(faults[idx][0], color='black', label='Fault', ls=':')
+            print(
+                f'Fault at t={faults[idx][0]:.2e}, iter={faults[idx][1][1]}, node={faults[idx][1][2]}, space={faults[idx][1][3]}, bit={faults[idx][1][4]}'
+            )
+            ax.set_title(f'Fault in bit {faults[idx][1][4]}')
+
+    ax.legend(frameon=True, loc='lower left')
+    ax.set_xlabel(r'$t$')
+    savefig(fig, f'fault_bit_{bit}')
+
+
+def plot_Lorenz_solution():  # pragma: no cover
+    my_setup_mpl()
+
+    fig, axs = plt.subplots(1, 2, figsize=figsize_by_journal(JOURNAL, 1, 0.4), sharex=True)
+
+    strategy = BaseStrategy()
+    desc = strategy.get_custom_description(run_Lorenz, num_procs=1)
+    stats, controller, _ = run_Lorenz(custom_description=desc, Tend=strategy.get_Tend(run_Lorenz))
+
+    u = get_sorted(stats, recomputed=False, type='u')
+
+    axs[0].plot([me[1][0] for me in u], [me[1][2] for me in u])
+    axs[0].set_ylabel('$z$')
+    axs[0].set_xlabel('$x$')
+
+    axs[1].plot([me[1][0] for me in u], [me[1][1] for me in u])
+    axs[1].set_ylabel('$y$')
+    axs[1].set_xlabel('$x$')
+
+    for ax in axs:
+        ax.set_box_aspect(1.0)
+
+    path = 'data/paper/Lorenz_sol.pdf'
+    fig.savefig(path, bbox_inches='tight', transparent=True, dpi=200)
+
+
 def plot_quench_solution():  # pragma: no cover
     """
     Plot the solution of Quench problem over time
@@ -586,6 +683,12 @@ def work_precision():  # pragma: no cover
     all_problems(**{**all_params, 'work_key': 'param'}, mode='compare_strategies')


+def plot_recovery_rate_per_acceptance_threshold(problem):  # pragma: no cover
+    stats_analyser = get_stats(problem)
+
+    stats_analyser.plot_recovery_thresholds(thresh_range=np.linspace(0.5, 1.5, 1000), recoverable_only=True)
+
+
 def make_plots_for_TIME_X_website():  # pragma: no cover
     global JOURNAL, BASE_PATH
     JOURNAL = 'JSC_beamer'
@@ -636,11 +739,14 @@ def make_plots_for_adaptivity_paper():  # pragma: no cover

 def make_plots_for_resilience_paper():  # pragma: no cover
-    compare_recovery_rate_problems(target='resilience', num_procs=1, strategy_type='SDC')
+    plot_Lorenz_solution()
+    plot_fault_Lorenz(0)
+    plot_fault_Lorenz(20)
     plot_RBC_solution()
-    plot_recovery_rate(get_stats(run_vdp))
-    plot_fault_vdp(0)
-    plot_fault_vdp(13)
+    compare_recovery_rate_problems(target='resilience', num_procs=1, strategy_type='SDC')
+    # plot_recovery_rate(get_stats(run_Lorenz))
+    # plot_recovery_rate_per_acceptance_threshold(run_Lorenz)
+
     plt.show()


 def make_plots_for_notes():  # pragma: no cover
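plot_Lorenz_solution above draws the x-z and x-y projections of the attractor from a pySDC run. For orientation, a self-contained sketch that produces the same two projections with scipy instead of pySDC (the Lorenz parameters and initial condition are assumptions, not taken from the patch):

import matplotlib.pyplot as plt
from scipy.integrate import solve_ivp


def lorenz(t, u, sigma=10.0, rho=28.0, beta=8.0 / 3.0):
    x, y, z = u
    return [sigma * (y - x), x * (rho - z) - y, x * y - beta * z]


sol = solve_ivp(lorenz, (0, 20), [1.0, 1.0, 1.0], max_step=1e-2)  # Tend=20 as in strategies.py below

fig, axs = plt.subplots(1, 2, sharex=True)
axs[0].plot(sol.y[0], sol.y[2])  # x-z projection
axs[0].set_xlabel('$x$')
axs[0].set_ylabel('$z$')
axs[1].plot(sol.y[0], sol.y[1])  # x-y projection
axs[1].set_xlabel('$x$')
axs[1].set_ylabel('$y$')
plt.show()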
diff --git a/pySDC/projects/Resilience/strategies.py b/pySDC/projects/Resilience/strategies.py
index 09bcbff62d..406e63995d 100644
--- a/pySDC/projects/Resilience/strategies.py
+++ b/pySDC/projects/Resilience/strategies.py
@@ -127,7 +127,7 @@ def get_fault_args(self, problem, num_procs):
         elif problem.__name__ == "run_quench":
             args['time'] = 41.0
         elif problem.__name__ == "run_Lorenz":
-            args['time'] = 0.3
+            args['time'] = 10
         elif problem.__name__ == "run_AC":
             args['time'] = 1e-2
         elif problem.__name__ == "run_RBC":
@@ -204,7 +204,7 @@ def get_Tend(cls, problem, num_procs=1):
         elif problem.__name__ == "run_piline":
             return 20.0
         elif problem.__name__ == "run_Lorenz":
-            return 1.5
+            return 20
         elif problem.__name__ == "run_Schroedinger":
             return 1.0
         elif problem.__name__ == "run_quench":
@@ -247,7 +247,7 @@ def get_base_parameters(self, problem, num_procs=1):
         elif problem.__name__ == "run_Lorenz":
             custom_description['step_params'] = {'maxiter': 5}
-            custom_description['level_params'] = {'dt': 1e-2}
+            custom_description['level_params'] = {'dt': 1e-3}
             custom_description['problem_params'] = {'stop_at_nan': False}
         elif problem.__name__ == "run_Schroedinger":
             custom_description['step_params'] = {'maxiter': 5}
@@ -293,7 +293,7 @@ def get_base_parameters(self, problem, num_procs=1):

         max_runtime = {
             'run_vdp': 1000,
-            'run_Lorenz': 60,
+            'run_Lorenz': 500,
             'run_Schroedinger': 150,
             'run_quench': 150,
             'run_AC': 150,
@@ -445,6 +445,8 @@ def get_custom_description_for_faults(self, problem, *args, **kwargs):
         desc = self.get_custom_description(problem, *args, **kwargs)
         if problem.__name__ == "run_quench":
             desc['level_params']['dt'] = 5.0
+        elif problem.__name__ == "run_AC":
+            desc['level_params']['dt'] = 8e-5
         return desc

     def get_reference_value(self, problem, key, op, num_procs=1):
@@ -462,9 +464,9 @@ def get_reference_value(self, problem, key, op, num_procs=1):
         """
         if problem.__name__ == "run_Lorenz":
             if key == 'work_newton' and op == sum:
-                return 2136
+                return 12350
             elif key == 'e_global_post_run' and op == max:
-                return 9.256926357892326e-06
+                return 1.3527453646133836e-07

         super().get_reference_value(problem, key, op, num_procs)

@@ -539,7 +541,7 @@ def get_custom_description(self, problem, num_procs):
         elif problem.__name__ == "run_vdp":
             e_tol = 2e-5
         elif problem.__name__ == "run_Lorenz":
-            e_tol = 2e-5
+            e_tol = 1e-7
         elif problem.__name__ == "run_Schroedinger":
             e_tol = 4e-7
         elif problem.__name__ == "run_quench":
@@ -592,9 +594,9 @@ def get_reference_value(self, problem, key, op, num_procs=1):
         """
         if problem.__name__ == 'run_Lorenz':
             if key == 'work_newton' and op == sum:
-                return 1369
+                return 2989
             elif key == 'e_global_post_run' and op == max:
-                return 9.364841517367495e-06
+                return 5.636767497207984e-08

         super().get_reference_value(problem, key, op, num_procs)

@@ -606,6 +608,8 @@ def get_custom_description_for_faults(self, problem, num_procs, *args, **kwargs)
         if problem.__name__ == "run_quench":
             desc['level_params']['dt'] = 1.1e1
             desc['convergence_controllers'][Adaptivity]['e_tol'] = 1e-6
+        elif problem.__name__ == "run_AC":
+            desc['convergence_controllers'][Adaptivity]['e_tol'] = 1e-5

         return desc

@@ -717,7 +721,7 @@ def get_reference_value(self, problem, key, op, num_procs=1):
         """
         if problem.__name__ == "run_Lorenz":
             if key == 'work_newton' and op == sum:
-                return 4758
+                return 5092
             elif key == 'e_global_post_run' and op == max:
                 return 4.107116318152748e-06
@@ -822,9 +826,9 @@ def get_reference_value(self, problem, key, op, num_procs=1):
         """
         if problem.__name__ == "run_Lorenz":
             if key == 'work_newton' and op == sum:
-                return 1872
+                return 9200
             elif key == 'e_global_post_run' and op == max:
-                return 2.2362043480939064e-05
+                return 2.139863344829962e-05

         super().get_reference_value(problem, key, op, num_procs)

@@ -855,7 +859,7 @@ def get_custom_description_for_faults(self, problem, *args, **kwargs):
         if problem.__name__ == 'run_quench':
             desc['level_params']['dt'] = 5.0
         elif problem.__name__ == 'run_AC':
-            desc['level_params']['dt'] = 0.6 * desc['problem_params']['eps'] ** 2
+            desc['level_params']['dt'] = 5e-4
         elif problem.__name__ == 'run_RBC':
             desc['level_params']['restol'] = 1e-6
         return desc
@@ -875,9 +879,9 @@ def get_reference_value(self, problem, key, op, num_procs=1):
         """
         if problem.__name__ == "run_Lorenz":
             if key == 'work_newton' and op == sum:
-                return 2392
+                return 12350
             elif key == 'e_global_post_run' and op == max:
-                return 4.808610118089973e-06
+                return 1.3527453646133836e-07

         super().get_reference_value(problem, key, op, num_procs)

@@ -975,7 +979,7 @@ def get_custom_description(self, problem, num_procs):
             'level_params': {},
         }
         if problem.__name__ == "run_AC":
-            custom_description['level_params']['dt'] = 0.8 * base_params['problem_params']['eps'] ** 2 / 8.0
+            custom_description['level_params']['dt'] = 8e-5
         return merge_descriptions(base_params, custom_description)

     def get_custom_description_for_faults(self, problem, *args, **kwargs):
@@ -999,9 +1003,9 @@ def get_reference_value(self, problem, key, op, num_procs=1):
         """
         if problem.__name__ == "run_Lorenz":
             if key == 'work_newton' and op == sum:
-                return 2329
+                return 12350
             elif key == 'e_global_post_run' and op == max:
-                return 9.256926357892326e-06
+                return 1.3527453646133836e-07

         super().get_reference_value(problem, key, op, num_procs)

@@ -1115,9 +1119,9 @@ def get_reference_value(self, problem, key, op, num_procs=1):
         """
         if problem.__name__ == "run_Lorenz":
             if key == 'work_newton' and op == sum:
-                return 983
+                return 1025
             elif key == 'e_global_post_run' and op == max:
-                return 3.944880392126038e-06
+                return 4.266975256683736e-06

         super().get_reference_value(problem, key, op, num_procs)

@@ -1154,7 +1158,7 @@ def get_reference_value(self, problem, key, op, num_procs=1):
             if key == 'work_newton' and op == sum:
                 return 917
             elif key == 'e_global_post_run' and op == max:
-                return 1.0587702028885815e-05
+                return 1.0874929465387595e-05

         super().get_reference_value(problem, key, op, num_procs)

@@ -1185,9 +1189,9 @@ def get_reference_value(self, problem, key, op, num_procs=1):
         """
         if problem.__name__ == 'run_Lorenz':
             if key == 'work_newton' and op == sum:
-                return 1358
+                return 1338
             elif key == 'e_global_post_run' and op == max:
-                return 0.00010316526647002888
+                return 0.0001013999955041811

         super().get_reference_value(problem, key, op, num_procs)

@@ -1268,9 +1272,9 @@ def get_reference_value(self, problem, key, op, num_procs=1):
         """
         if problem.__name__ == "run_Lorenz":
             if key == 'work_newton' and op == sum:
-                return 1456
+                return 5467
             elif key == 'e_global_post_run' and op == max:
-                return 0.00013730538358736055
+                return 7.049480537091313e-07

         super().get_reference_value(problem, key, op, num_procs)

@@ -1463,9 +1467,9 @@ def get_reference_value(self, problem, key, op, num_procs=1):
         """
         if problem.__name__ == "run_Lorenz":
             if key == 'work_newton' and op == sum:
-                return 820
+                return 2963
             elif key == 'e_global_post_run' and op == max:
-                return 3.148061889390874e-06
+                return 4.126039954144289e-09

         super().get_reference_value(problem, key, op, num_procs)

@@ -1556,7 +1560,7 @@ def get_reference_value(self, problem, key, op, num_procs=1):
             if key == 'work_newton' and op == sum:
                 return 0
             elif key == 'e_global_post_run' and op == max:
-                return 3.5085474063834e-05
+                return 1.509206128957885e-07

         super().get_reference_value(problem, key, op, num_procs)

@@ -1633,9 +1637,9 @@ def get_reference_value(self, problem, key, op, num_procs=1):
         """
         if problem.__name__ == 'run_Lorenz':
             if key == 'work_newton' and op == sum:
-                return 1369
+                return 2989
             elif key == 'e_global_post_run' and op == max:
-                return 9.364841517367495e-06
+                return 5.636763944494305e-08

         super().get_reference_value(problem, key, op, num_procs)

@@ -1688,9 +1692,9 @@ def get_reference_value(self, problem, key, op, num_procs=1):
         """
         if problem.__name__ == "run_Lorenz":
             if key == 'work_newton' and op == sum:
-                return 1369
+                return 2989
             elif key == 'e_global_post_run' and op == max:
-                return 9.364841517367495e-06
+                return 5.636763944494305e-08

         super().get_reference_value(problem, key, op, num_procs)

@@ -1912,7 +1916,7 @@ def get_custom_description(self, problem, num_procs):
         elif problem.__name__ == "run_piline":
             e_tol = 1e-7
         elif problem.__name__ == "run_Lorenz":
-            e_tol = 7e-4
+            e_tol = 2e-4
         elif problem.__name__ == "run_Schroedinger":
             e_tol = 3e-5
         elif problem.__name__ == "run_quench":
@@ -2007,9 +2011,9 @@ def get_reference_value(self, problem, key, op, num_procs=1):
         """
         if problem.__name__ == "run_Lorenz":
             if key == 'work_newton' and op == sum:
-                return 2124
+                return 2123
             elif key == 'e_global_post_run' and op == max:
-                return 8.484321512014503e-08
+                return 7.931560830343187e-08

         super().get_reference_value(problem, key, op, num_procs)
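The Lorenz reference values above change because the fault campaigns now run with dt=1e-3 up to Tend=20. The bit indices used throughout (bit 0 vs. bit 20 in plot_fault_Lorenz) refer to positions in the IEEE-754 representation of a float64; a rough standalone illustration of what such a flip does (the numbering 0 = sign, 1-11 = exponent, 12-63 = mantissa is an assumption here, not taken from the patch, and this is not the project's FaultInjector):

import numpy as np


def flip_bit(value, bit):
    # Illustration only: XOR one bit of the binary representation of a float64.
    as_int = np.float64(value).view(np.uint64)
    flipped = as_int ^ (np.uint64(1) << np.uint64(63 - bit))  # bit 0 -> most significant bit (sign)
    return flipped.view(np.float64)


x = 1.0
print(flip_bit(x, 0))   # sign bit: 1.0 becomes -1.0
print(flip_bit(x, 20))  # a high mantissa bit: small relative perturbation of about 2e-3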
diff --git a/pySDC/projects/Resilience/tests/test_fault_injection.py b/pySDC/projects/Resilience/tests/test_fault_injection.py
index c44ea38307..d23b5eccfc 100644
--- a/pySDC/projects/Resilience/tests/test_fault_injection.py
+++ b/pySDC/projects/Resilience/tests/test_fault_injection.py
@@ -153,29 +153,29 @@ def test_fault_injection():

 @pytest.mark.mpi4py
 @pytest.mark.slow
-@pytest.mark.parametrize("numprocs", [5])
-def test_fault_stats(numprocs):
+@pytest.mark.parametrize('strategy_name', ['adaptivity'])
+def test_fault_stats(strategy_name):
     """
     Test generation of fault statistics and their recovery rates
     """
     import numpy as np

+    from pySDC.projects.Resilience.strategies import (
+        BaseStrategy,
+        AdaptivityStrategy,
+        kAdaptivityStrategy,
+        HotRodStrategy,
+    )
-    # Set python path once
-    my_env = os.environ.copy()
-    my_env['PYTHONPATH'] = '../../..:.'
-    my_env['COVERAGE_PROCESS_START'] = 'pyproject.toml'
-
-    cmd = f"mpirun -np {numprocs} python {__file__} --test-fault-stats".split()
+    strategies = {
+        'base': BaseStrategy,
+        'adaptivity': AdaptivityStrategy,
+        'kAdaptivity': kAdaptivityStrategy,
+        'HotRod': HotRodStrategy,
+    }

-    p = subprocess.Popen(cmd, env=my_env, cwd=".")
+    strategy = strategies[strategy_name]()

-    p.wait()
-    assert p.returncode == 0, 'ERROR: did not get return code 0, got %s with %2i processes' % (
-        p.returncode,
-        numprocs,
-    )
-
-    stats = generate_stats(True)
+    stats = generate_stats(strategy, True)

     # test number of possible combinations for faults
     expected_max_combinations = 3840
@@ -184,35 +184,34 @@
     ), f"Expected {expected_max_combinations} possible combinations for faults in van der Pol problem, but got {stats.get_max_combinations()}!"

     recovered_reference = {
-        'base': 1,
-        'adaptivity': 2,
+        'base': 0,
+        'adaptivity': 1,
         'iterate': 1,
-        'Hot Rod': 2,
+        'Hot Rod': 1,
         'adaptivity_coll': 0,
         'double_adaptivity': 0,
     }
     stats.get_recovered()

-    for strategy in stats.strategies:
-        dat = stats.load(strategy=strategy, faults=True)
-        fixable_mask = stats.get_fixable_faults_only(strategy)
-        recovered_mask = stats.get_mask(strategy=strategy, key='recovered', op='eq', val=True)
-        index = stats.get_index(mask=fixable_mask)
+    dat = stats.load(strategy=strategy, faults=True)
+    fixable_mask = stats.get_fixable_faults_only(strategy)
+    recovered_mask = stats.get_mask(strategy=strategy, key='recovered', op='eq', val=True)
+    index = stats.get_index(mask=fixable_mask)

-        assert all(fixable_mask[:-1] == [False, True, False]), "Error in generating mask of fixable faults"
-        assert all(index == [1, 3]), "Error when converting to index"
+    assert all(fixable_mask == [False, True]), "Error in generating mask of fixable faults"
+    assert all(index == [1]), "Error when converting to index"

-        combinations = np.array(stats.get_combination_counts(dat, keys=['bit'], mask=fixable_mask))
-        assert all(combinations == [1.0, 1.0]), "Error when counting combinations"
+    combinations = np.array(stats.get_combination_counts(dat, keys=['bit'], mask=fixable_mask))
+    assert all(combinations == [1.0, 1.0]), "Error when counting combinations"

-        recovered = len(dat['recovered'][recovered_mask])
-        crashed = len(dat['error'][dat['error'] == np.inf])  # on some systems the last run crashes...
-        assert (
-            recovered >= recovered_reference[strategy.name] - crashed
-        ), f'Expected {recovered_reference[strategy.name]} recovered faults, but got {recovered} recovered faults in {strategy.name} strategy!'
+    recovered = len(dat['recovered'][recovered_mask])
+    crashed = len(dat['error'][dat['error'] == np.inf])  # on some systems the last run crashes...
+    assert (
+        recovered >= recovered_reference[strategy.name] - crashed
+    ), f'Expected {recovered_reference[strategy.name]} recovered faults, but got {recovered} recovered faults in {strategy.name} strategy!'


-def generate_stats(load=False):
+def generate_stats(strategy, load=False):
     """
     Generate stats to check the recovery rate

     Args:

     Returns:
         Object containing the stats
     """
-    from pySDC.projects.Resilience.strategies import (
-        BaseStrategy,
-        AdaptivityStrategy,
-        IterateStrategy,
-        HotRodStrategy,
-    )
     from pySDC.projects.Resilience.fault_stats import (
         FaultStats,
     )
@@ -242,15 +235,10 @@
         recovery_thresh=1.1,
         num_procs=1,
         mode='random',
-        strategies=[
-            BaseStrategy(),
-            AdaptivityStrategy(),
-            IterateStrategy(),
-            HotRodStrategy(),
-        ],
+        strategies=[strategy],
         stats_path='data',
     )
-    stats.run_stats_generation(runs=4, step=2)
+    stats.run_stats_generation(runs=2, step=1)

     return stats
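The refactored test above exercises the FaultStats mask helpers (get_fixable_faults_only, get_mask, get_index) for a single strategy. Conceptually these are boolean filters over the stored run records; a pure-numpy sketch of that composition (the "fixable" rule and the arrays are invented for illustration and do not mirror the pySDC implementation):

import numpy as np

# Invented stand-ins for fields of the stored run record `dat`.
error = np.array([1e-9, 3e-4, 2e-9, np.inf])
bit = np.array([0, 20, 33, 0])

fixable = bit >= 12                     # illustrative rule: only mantissa bits count as fixable
too_large = error > 1.1 * 1e-9          # like get_mask(key='error', op='gt', val=thresh * reference)
not_recovered = too_large & fixable     # an old_mask is combined by logical AND
index = np.flatnonzero(not_recovered)   # like get_index: mask -> run indices
print(index)                            # -> [1 2]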
diff --git a/pySDC/projects/Resilience/tests/test_strategies.py b/pySDC/projects/Resilience/tests/test_strategies.py
index 7e9b0e71a2..b218fc162a 100644
--- a/pySDC/projects/Resilience/tests/test_strategies.py
+++ b/pySDC/projects/Resilience/tests/test_strategies.py
@@ -17,7 +17,7 @@
     'AdaptivityPolynomialError',
     'kAdaptivity',
 ]
-STRATEGY_NAMES_NONMPIONLY = ['adaptiveHR', 'HotRod']
+STRATEGY_NAMES_NONMPIONLY = ['HotRod']
 STRATEGY_NAMES_MPIONLY = ['ARK']
 LOGGER_LEVEL = 30

@@ -81,6 +81,7 @@ def single_test(strategy_name, useMPI, num_procs):
         use_MPI=useMPI,
         custom_controller_params=controller_params,
         comm=comm,
+        Tend=1.0,
     )

     # things we want to test
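The recovery assertion in test_fault_injection.py ties back to the first change in this patch: crashed runs now record an error of np.inf, and the test subtracts their count from the expected number of recoveries. A small numpy sketch of that bookkeeping with made-up data:

import numpy as np

# Made-up stand-in for the stored run record `dat` of one strategy.
dat = {
    'recovered': np.array([True, False, True, False]),
    'error': np.array([1e-9, 4e-4, 1e-9, np.inf]),  # np.inf marks a crashed run (see fault_stats.py above)
}
recovered_reference = 2  # expected recoveries for this strategy

recovered = int(dat['recovered'].sum())
crashed = int(np.sum(dat['error'] == np.inf))
# Crashed runs are tolerated by lowering the expected count, as in the test above.
assert recovered >= recovered_reference - crashed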