Support resetting of contention metric, fix deadlocking (#3)

- Support method reset_contention_metric() which resets the contention metric and timers. - Fix deadlocking by internally spawning a thread specifically to try and obtain the GIL. This allows the monitoring thread to remain available for receiving messages. Thereby also removing the flaky decorator and xfail markers. Ref dask/distributed#7290 (comment)
milesgranger · Jan 18, 2023 · aed7ec4 · aed7ec4
1 parent 8e107d3
commit aed7ec4
Show file tree

Hide file tree

Showing 4 changed files with 89 additions and 41 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "gil-knocker"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 authors = ["Miles Granger <miles59923@gmail.com>"]
 license = "MIT"

diff --git a/src/lib.rs b/src/lib.rs
@@ -1,6 +1,7 @@
 #[deny(missing_docs)]
 use parking_lot::{const_rwlock, RwLock};
 use pyo3::exceptions::PyRuntimeError;
+use pyo3::ffi::{PyEval_InitThreads, PyEval_ThreadsInitialized};
 use pyo3::prelude::*;
 use pyo3::{
  exceptions::{PyBrokenPipeError, PyTimeoutError, PyValueError},
@@ -23,6 +24,12 @@ fn gilknocker(_py: Python, m: &PyModule) -> PyResult<()> {
  Ok(())
 }
 
+/// Possible messages to pass to the monitoring thread.
+enum Message {
+ Stop,
+ Reset,
+}
+
 /// Struct for polling, knocking on the GIL,
 /// checking if it's locked in the current thread
 ///
@@ -40,8 +47,8 @@ fn gilknocker(_py: Python, m: &PyModule) -> PyResult<()> {
 #[derive(Default)]
 pub struct KnockKnock {
  handle: Option<thread::JoinHandle<()>>,
- channel: Option<Sender<bool>>,
- contention_metric: Option<Arc<RwLock<f32>>>,
+ channel: Option<Sender<Message>>,
+ contention_metric: Arc<RwLock<f32>>,
  interval: Duration,
  timeout: Duration,
 }
@@ -72,32 +79,80 @@ impl KnockKnock {
  /// and lower indicates less contention, with 0 theoretically indicating zero
  /// contention.
  #[getter]
- pub fn contention_metric(&self) -> Option<f32> {
- self.contention_metric.as_ref().map(|v| *(*v).read())
+ pub fn contention_metric(&self) -> f32 {
+ *(*self.contention_metric).read()
+ }
+
+ /// Reset the contention metric/monitoring state
+ pub fn reset_contention_metric(&mut self) -> PyResult<()> {
+ match &self.channel {
+ Some(channel) => {
+ channel
+ .send(Message::Reset)
+ .map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
+
+ // need to wait for thread to catch and process reset
+ while *(*self.contention_metric).read() > 0.005 {
+ thread::sleep(Duration::from_millis(1));
+ }
+ Ok(())
+ }
+ None => Err(PyValueError::new_err(
+ "Does not appear `start` was called, nothing to reset.",
+ )),
+ }
  }
 
  /// Start polling the GIL to check if it's locked.
  pub fn start(&mut self, py: Python) -> () {
  let (send, recv) = channel();
  self.channel = Some(send);
 
+ unsafe {
+ if PyEval_ThreadsInitialized() == 0 {
+ PyEval_InitThreads();
+ }
+ }
+
  let contention_metric = Arc::new(const_rwlock(0_f32));
- self.contention_metric = Some(contention_metric.clone());
+ self.contention_metric = contention_metric.clone();
+
  let interval = self.interval;
  let handle = py.allow_threads(move || {
  thread::spawn(move || {
  let mut time_to_acquire = Duration::from_millis(0);
- let runtime = Instant::now();
- while recv
- .recv_timeout(interval)
- .unwrap_or_else(|e| e != RecvTimeoutError::Disconnected)
- {
- let start = Instant::now();
- time_to_acquire += Python::with_gil(move |_py| start.elapsed());
- {
- let mut cm = (*contention_metric).write();
- *cm = time_to_acquire.as_micros() as f32
- / runtime.elapsed().as_micros() as f32;
+ let mut runtime = Instant::now();
+ let mut handle: Option<thread::JoinHandle<Duration>> = None;
+ loop {
+ match recv.recv_timeout(interval) {
+ Ok(message) => match message {
+ Message::Stop => break,
+ Message::Reset => {
+ time_to_acquire = Duration::from_millis(0);
+ runtime = Instant::now();
+ *(*contention_metric).write() = 0_f32;
+ }
+ },
+ Err(RecvTimeoutError::Disconnected) => break,
+ Err(RecvTimeoutError::Timeout) => match handle {
+ Some(hdl) => {
+ if hdl.is_finished() {
+ time_to_acquire += hdl.join().unwrap();
+ let mut cm = (*contention_metric).write();
+ *cm = time_to_acquire.as_micros() as f32
+ / runtime.elapsed().as_micros() as f32;
+ handle = None;
+ } else {
+ handle = Some(hdl);
+ }
+ }
+ None => {
+ handle = Some(thread::spawn(move || {
+ let start = Instant::now();
+ Python::with_gil(move |_py| start.elapsed())
+ }));
+ }
+ },
  }
  }
  })
@@ -110,7 +165,7 @@ impl KnockKnock {
  match take(&mut self.handle) {
  Some(handle) => {
  if let Some(send) = take(&mut self.channel) {
- send.send(false)
+ send.send(Message::Stop)
  .map_err(|e| PyBrokenPipeError::new_err(e.to_string()))?;
 
  let start = Instant::now();

diff --git a/tests/test_knockknock.py b/tests/test_knockknock.py
@@ -9,21 +9,6 @@
 N_PTS = 4096
 
 
-def flaky(n_tries=10):
- def wrapper(func):
- def _wrapper(*args, **kwargs):
- for _ in range(n_tries - 1):
- try:
- return func(*args, **kwargs)
- except:
- pass
- return func(*args, **kwargs)
-
- return _wrapper
-
- return wrapper
-
-
 def a_lotta_gil():
  """Keep the GIL busy"""
  for i in range(100_000_000):
@@ -52,13 +37,11 @@ def _run(target):
  return knocker
 
 
-@pytest.mark.xfail(raises=TimeoutError)
-@flaky()
 def test_knockknock_busy():
  knocker = _run(a_lotta_gil)
 
  try:
- # usually ~0.9 on linux ~0.6 on windows
+ # usually ~0.9, but sometimes ~0.6 on Mac
  assert knocker.contention_metric > 0.6
 
  # Now wait for it to 'cool' back down
@@ -70,23 +53,33 @@ def test_knockknock_busy():
  prev_cm = knocker.contention_metric
 
  # ~0.15 oN mY MaChInE.
- assert knocker.contention_metric < 0.2
+ assert knocker.contention_metric < 0.3
  finally:
  knocker.stop()
 
 
-@pytest.mark.xfail(raises=TimeoutError)
-@flaky()
 def test_knockknock_available_gil():
  knocker = _run(a_little_gil)
 
  try:
- # usually ~0.001 on linux and ~0.05 on windows
+ # usually ~0.002
  assert knocker.contention_metric < 0.06
  finally:
  knocker.stop()
 
 
+def test_knockknock_reset_contention_metric():
+ knocker = _run(a_lotta_gil)
+
+ try:
+ assert knocker.contention_metric > 0.6
+ knocker.reset_contention_metric()
+ assert knocker.contention_metric < 0.001
+
+ finally:
+ knocker.stop()
+
+
 # Manual verification with py-spy
 # busy should give high GIL %
 if __name__ == "__main__":