From a8cf08d9e6db3d3e6ec457505f25f4063b0dacb4 Mon Sep 17 00:00:00 2001 From: Anselm Kruis Date: Sun, 11 Nov 2018 17:36:57 +0100 Subject: [PATCH] Stackless issue #181: Replace slp_dont_optimize... vars by more appropriate compiler specific code. The new code should work with whole program optimisation. (cherry picked from commit 464ef17bcbf634ae33b37672a5b803f0ce940086) --- Python/ceval.c | 12 +++---- Stackless/core/slp_transfer.c | 15 ++++----- Stackless/core/stacklesseval.c | 9 +++-- Stackless/module/scheduling.c | 3 +- Stackless/platf/slp_platformselect.h | 49 ++++++++++++++++++++++++++-- 5 files changed, 66 insertions(+), 22 deletions(-) diff --git a/Python/ceval.c b/Python/ceval.c index 21e0c5ed37ac4f..fc9478e5ecbab4 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -4048,9 +4048,6 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag) #ifdef STACKLESS -/* a global write only dummy variable */ -char _dont_optimise_away_slp_eval_frame_functions; - PyObject * slp_eval_frame_noval(PyFrameObject *f, int throwflag, PyObject *retval) { @@ -4060,7 +4057,8 @@ slp_eval_frame_noval(PyFrameObject *f, int throwflag, PyObject *retval) * it serves as a marker whether we expect a value or * not, and it makes debugging a little easier. */ - _dont_optimise_away_slp_eval_frame_functions = 1; + SLP_DO_NOT_OPTIMIZE_AWAY((char *)1); + r = slp_eval_frame_value(f, throwflag, retval); return r; } @@ -4075,7 +4073,7 @@ slp_eval_frame_iter(PyFrameObject *f, int throwflag, PyObject *retval) * for_iter operation. In this case we need to handle * null without error as valid result. */ - _dont_optimise_away_slp_eval_frame_functions = 2; + SLP_DO_NOT_OPTIMIZE_AWAY((char *)2); r = slp_eval_frame_value(f, throwflag, retval); return r; } @@ -4090,7 +4088,7 @@ slp_eval_frame_setup_with(PyFrameObject *f, int throwflag, PyObject *retval) * SETUP_WITH operation. * NOTE / XXX: see above. 
*/ - _dont_optimise_away_slp_eval_frame_functions = 3; + SLP_DO_NOT_OPTIMIZE_AWAY((char *)3); r = slp_eval_frame_value(f, throwflag, retval); return r; } @@ -4105,7 +4103,7 @@ slp_eval_frame_with_cleanup(PyFrameObject *f, int throwflag, PyObject *retval) * WITH_CLEANUP operation. * NOTE / XXX: see above. */ - _dont_optimise_away_slp_eval_frame_functions = 4; + SLP_DO_NOT_OPTIMIZE_AWAY((char *)4); r = slp_eval_frame_value(f, throwflag, retval); return r; } diff --git a/Stackless/core/slp_transfer.c b/Stackless/core/slp_transfer.c index b0d050bbb47f98..6564efd1f4383f 100644 --- a/Stackless/core/slp_transfer.c +++ b/Stackless/core/slp_transfer.c @@ -45,6 +45,8 @@ static PyTaskletObject *_prev; #define SLP_EVAL #include "platf/slp_platformselect.h" +SLP_DO_NOT_OPTIMIZE_AWAY_DEFINITIONS + #ifdef EXTERNAL_ASM /* CCP addition: Make these functions, to be called from assembler. * The token include file for the given platform should enable the @@ -80,8 +82,6 @@ extern int slp_switch(void); #endif -/* a write only variable used to prevent overly optimisation */ -intptr_t *global_goobledigoobs; static int climb_stack_and_transfer(PyCStackObject **cstprev, PyCStackObject *cst, PyTaskletObject *prev) @@ -96,15 +96,14 @@ climb_stack_and_transfer(PyCStackObject **cstprev, PyCStackObject *cst, intptr_t probe; register ptrdiff_t needed = &probe - ts->st.cstack_base; /* in rare cases, the need might have vanished due to the recursion */ - register intptr_t *goobledigoobs; if (needed > 0) { - goobledigoobs = alloca(needed * sizeof(intptr_t)); - if (goobledigoobs == NULL) + register void * stack_ptr_tmp = alloca(needed * sizeof(intptr_t)); + if (stack_ptr_tmp == NULL) return -1; - /* hinder the compiler to optimise away - goobledigoobs and the alloca call. + /* keep the compiler from optimising away + stack_ptr_tmp and the alloca call. 
This happens with gcc 4.7.x and -O2 */ - global_goobledigoobs = goobledigoobs; + SLP_DO_NOT_OPTIMIZE_AWAY(stack_ptr_tmp); } return slp_transfer(cstprev, cst, prev); } diff --git a/Stackless/core/stacklesseval.c b/Stackless/core/stacklesseval.c index 1a2af98a81d68c..87ce00d787fb39 100644 --- a/Stackless/core/stacklesseval.c +++ b/Stackless/core/stacklesseval.c @@ -281,11 +281,14 @@ climb_stack_and_eval_frame(PyFrameObject *f) intptr_t probe; ptrdiff_t needed = &probe - ts->st.cstack_base; /* in rare cases, the need might have vanished due to the recursion */ - intptr_t *goobledigoobs; if (needed > 0) { - goobledigoobs = alloca(needed * sizeof(intptr_t)); - if (goobledigoobs == NULL) + register void * stack_ptr_tmp = alloca(needed * sizeof(intptr_t)); + if (stack_ptr_tmp == NULL) return NULL; + /* keep the compiler from optimising away + stack_ptr_tmp and the alloca call. + This happens with gcc 4.7.x and -O2 */ + SLP_DO_NOT_OPTIMIZE_AWAY(stack_ptr_tmp); } return slp_eval_frame(f); } diff --git a/Stackless/module/scheduling.c b/Stackless/module/scheduling.c index 2d80302cbea2d9..5141f40ec0ff25 100644 --- a/Stackless/module/scheduling.c +++ b/Stackless/module/scheduling.c @@ -361,7 +361,6 @@ typedef struct { /* not a valid ptr and not a common integer */ #define SAVED_TSTATE_MAGIC1 (((intptr_t)transfer_with_exc)+1) #define SAVED_TSTATE_MAGIC2 (-1*((intptr_t)transfer_with_exc)) -saved_tstat_with_magic_t * _dont_optimise_away_saved_tstat_with_magic; static int transfer_with_exc(PyCStackObject **cstprev, PyCStackObject *cst, PyTaskletObject *prev) @@ -376,7 +375,7 @@ transfer_with_exc(PyCStackObject **cstprev, PyCStackObject *cst, PyTaskletObject /* prevent overly compiler optimisation. We store the address of sm into a global variable. This way the optimizer can't change the layout of the structure. 
*/ - _dont_optimise_away_saved_tstat_with_magic = &sm; + SLP_DO_NOT_OPTIMIZE_AWAY(&sm); sm.s.tracing = ts->tracing; sm.s.c_profilefunc = ts->c_profilefunc; diff --git a/Stackless/platf/slp_platformselect.h b/Stackless/platf/slp_platformselect.h index 0f90dfdf0ded77..dc5921015c8b65 100644 --- a/Stackless/platf/slp_platformselect.h +++ b/Stackless/platf/slp_platformselect.h @@ -17,9 +17,9 @@ #elif defined(__GNUC__) && defined(sparc) && defined(sun) #include "switch_sparc_sun_gcc.h" /* SunOS sparc with gcc */ #elif defined(__GNUC__) && defined(__s390__) && defined(__linux__) -#include "switch_s390_unix.h" /* Linux/S390 */ +#include "switch_s390_unix.h" /* Linux/S390 */ #elif defined(__GNUC__) && defined(__s390x__) && defined(__linux__) -#include "switch_s390_unix.h" /* Linux/S390 zSeries (identical) */ +#include "switch_s390_unix.h" /* Linux/S390 zSeries (identical) */ #elif defined(__GNUC__) && defined(__arm__) && defined(__thumb__) #include "switch_arm_thumb_gcc.h" /* gcc using arm thumb */ #elif defined(__GNUC__) && defined(__arm32__) @@ -32,6 +32,51 @@ /* default definitions if not defined in above files */ +/* + * Call SLP_DO_NOT_OPTIMIZE_AWAY(pointer) to ensure that pointer is still + * computed after optimization. Use it for pointers that are computed but + * otherwise unused. The compiler tends to do a good job at eliminating + * unused variables, and this macro makes it treat pointer as if it were + * needed. + */ + +#ifndef SLP_DO_NOT_OPTIMIZE_AWAY + +/* Code is based on Facebook folly + * https://github.com/facebook/folly/blob/master/folly/Benchmark.h, + * which has an Apache 2 license. 
+ */ +#ifdef _MSC_VER + +#pragma optimize("", off) + +static inline void doNotOptimizeDependencySink(const void* p) {} + +#pragma optimize("", on) + +#define SLP_DO_NOT_OPTIMIZE_AWAY(pointer) doNotOptimizeDependencySink(pointer) +#define SLP_DO_NOT_OPTIMIZE_AWAY_DEFINITIONS /* empty */ + +#elif (defined(__GNUC__) || defined(__clang__)) +/* + * The "r" constraint forces the compiler to make pointer available + * in a register to the asm block, which means that it must have + * computed/loaded it. + */ +#define SLP_DO_NOT_OPTIMIZE_AWAY(pointer) \ + do {__asm__ volatile("" ::"r"(pointer));} while(0) +#define SLP_DO_NOT_OPTIMIZE_AWAY_DEFINITIONS /* empty */ +#else +/* + * Unknown compiler: store the pointer in a volatile global sink. + */ +#define SLP_DO_NOT_OPTIMIZE_AWAY(pointer) \ + do { slp_do_not_opimize_away_sink = ((void*)(pointer)); } while(0) +extern uint8_t* volatile slp_do_not_opimize_away_sink; +#define SLP_DO_NOT_OPTIMIZE_AWAY_DEFINITIONS uint8_t* volatile slp_do_not_opimize_away_sink; +#endif +#endif + /* adjust slots to typical size of a few recursions on your system */ #ifndef CSTACK_SLOTS