Skip to content

Commit bbe763f

Browse files
authored
fix: prevent refresh coordinator deadlock and reset missing-env reporting on reconfigure (#409)
## Summary Two high-impact bug fixes for the refresh coordinator and missing-env reporting state machine. ## Changes - **#396 — `force_complete_request` deadlock:** Added `RefreshSafetyGuard` that ensures the coordinator transitions back to `Idle` if the refresh-owning thread exits without constructing a `RefreshCompletionGuard` (e.g., if `begin_completion()` panics). Also updated `force_complete_request()` to handle the `Running` state and log mismatched-key cases instead of silently ignoring them. - **#395 — `MISSING_ENVS_REPORTING_STATE` permanently exhausted:** Reset `MISSING_ENVS_REPORTING_STATE` to `MISSING_ENVS_AVAILABLE` inside `handle_configure` when the generation is bumped. The store is placed inside the configuration write lock to avoid a TOCTOU window with concurrent refresh threads. - Added 4 tests covering the new recovery paths and reset behavior. Fixes #396 Fixes #395
1 parent 2a12535 commit bbe763f

1 file changed

Lines changed: 212 additions & 3 deletions

File tree

crates/pet/src/jsonrpc.rs

Lines changed: 212 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,67 @@ impl RefreshCoordinator {
217217
*state = RefreshCoordinatorState::Idle;
218218
self.changed.notify_all();
219219
}
220+
RefreshCoordinatorState::Running(active) if active.key == *key => {
221+
// Recovery path: if begin_completion() panicked, the state was
222+
// restored to Running before the unwind. Transition to Idle so
223+
// waiters are not stuck forever.
224+
*state = RefreshCoordinatorState::Idle;
225+
self.changed.notify_all();
226+
}
220227
RefreshCoordinatorState::Idle => {}
221-
_ => {}
228+
RefreshCoordinatorState::Completing(active) => {
229+
// Mismatched key — another refresh owns this state. Log and
230+
// leave it alone; the owning refresh will clean up.
231+
error!(
232+
"force_complete_request called with mismatched key while coordinator was Completing; caller key: {:?}, active key: {:?}",
233+
key,
234+
active.key
235+
);
236+
}
237+
RefreshCoordinatorState::Running(active) => {
238+
// Mismatched key — another refresh owns this state. Log and
239+
// leave it alone; the owning refresh will clean up.
240+
error!(
241+
"force_complete_request called with mismatched key while coordinator was Running; caller key: {:?}, active key: {:?}",
242+
key,
243+
active.key
244+
);
245+
}
246+
}
247+
}
248+
}
249+
250+
/// Safety guard created when a refresh thread takes ownership of the `Running`
251+
/// state. If the thread exits the `Start` arm without ever constructing a
252+
/// `RefreshCompletionGuard` (e.g., because `begin_completion` panics), this
253+
/// guard calls `force_complete_request` to transition the coordinator back to
254+
/// `Idle`, preventing a permanent deadlock.
255+
struct RefreshSafetyGuard<'a> {
256+
coordinator: &'a RefreshCoordinator,
257+
key: RefreshKey,
258+
disarmed: bool,
259+
}
260+
261+
impl<'a> RefreshSafetyGuard<'a> {
262+
fn new(coordinator: &'a RefreshCoordinator, key: RefreshKey) -> Self {
263+
Self {
264+
coordinator,
265+
key,
266+
disarmed: false,
267+
}
268+
}
269+
270+
/// Disarm the safety guard once a `RefreshCompletionGuard` takes over
271+
/// responsibility for the state transition.
272+
fn disarm(&mut self) {
273+
self.disarmed = true;
274+
}
275+
}
276+
277+
impl Drop for RefreshSafetyGuard<'_> {
278+
fn drop(&mut self) {
279+
if !self.disarmed {
280+
self.coordinator.force_complete_request(&self.key);
222281
}
223282
}
224283
}
@@ -530,6 +589,11 @@ pub fn handle_configure(context: Arc<Context>, id: u32, params: Value) {
530589
state.config.cache_directory = Some(cache_directory);
531590
}
532591
state.generation += 1;
592+
// Reset missing-env reporting so that the next refresh
593+
// after reconfiguration can trigger it again (Fixes #395).
594+
// Done inside the write lock to avoid a TOCTOU window with
595+
// concurrent refresh threads reading the generation.
596+
MISSING_ENVS_REPORTING_STATE.store(MISSING_ENVS_AVAILABLE, Ordering::Release);
533597
trace!(
534598
"Configuring locators with generation {}: {:?}",
535599
state.generation,
@@ -886,6 +950,15 @@ pub fn handle_refresh(context: Arc<Context>, id: u32, params: Value) {
886950
context.refresh_coordinator.wait_until_idle();
887951
}
888952
RefreshRegistration::Start => {
953+
// Safety guard: if anything in this arm panics
954+
// (including begin_completion), force the
955+
// coordinator back to Idle so waiters are not
956+
// stuck forever.
957+
// Move refresh_key into the guard to avoid an
958+
// extra clone of potentially large search_paths.
959+
let mut safety_guard =
960+
RefreshSafetyGuard::new(&context.refresh_coordinator, refresh_key);
961+
889962
let refresh_result = panic::catch_unwind(AssertUnwindSafe(|| {
890963
execute_refresh(
891964
context.as_ref(),
@@ -899,8 +972,9 @@ pub fn handle_refresh(context: Arc<Context>, id: u32, params: Value) {
899972
let refresh_result = execution.result.clone();
900973
let mut completion_guard = RefreshCompletionGuard::begin(
901974
&context.refresh_coordinator,
902-
&refresh_key,
975+
&safety_guard.key,
903976
);
977+
safety_guard.disarm();
904978
finish_refresh_replies(&mut completion_guard, &refresh_result);
905979
report_refresh_follow_up(execution);
906980
}
@@ -911,8 +985,9 @@ pub fn handle_refresh(context: Arc<Context>, id: u32, params: Value) {
911985
);
912986
let mut completion_guard = RefreshCompletionGuard::begin(
913987
&context.refresh_coordinator,
914-
&refresh_key,
988+
&safety_guard.key,
915989
);
990+
safety_guard.disarm();
916991
finish_refresh_errors(
917992
&mut completion_guard,
918993
"Refresh failed unexpectedly",
@@ -1898,4 +1973,138 @@ mod tests {
18981973
assert_eq!(result_config.executables, Some(vec![executable]));
18991974
assert!(matches!(search_scope, Some(SearchScope::Workspace)));
19001975
}
1976+
1977+
/// Test for #396: force_complete_request recovers from Running state.
1978+
/// When begin_completion() cannot be reached (e.g., the thread panics before
1979+
/// constructing a RefreshCompletionGuard), force_complete_request must still
1980+
/// transition Running → Idle to unblock waiters.
1981+
#[test]
1982+
fn test_force_complete_request_recovers_from_running_state() {
1983+
let coordinator = RefreshCoordinator::default();
1984+
let key = make_refresh_key(1, RefreshOptions::default());
1985+
1986+
// State → Running(key)
1987+
assert!(matches!(
1988+
coordinator.register_request(1, key.clone()),
1989+
RefreshRegistration::Start
1990+
));
1991+
1992+
// Simulate recovery: force_complete_request from Running state.
1993+
coordinator.force_complete_request(&key);
1994+
1995+
// Verify we're back to Idle and can start a new refresh.
1996+
assert!(matches!(
1997+
coordinator.register_request(2, key.clone()),
1998+
RefreshRegistration::Start
1999+
));
2000+
}
2001+
2002+
/// Test for #396: RefreshSafetyGuard transitions Running → Idle on drop
2003+
/// when begin_completion is never reached.
2004+
#[test]
2005+
fn test_safety_guard_recovers_running_state_on_drop() {
2006+
let coordinator = Arc::new(RefreshCoordinator::default());
2007+
let key = make_refresh_key(1, RefreshOptions::default());
2008+
let other_key = make_refresh_key(
2009+
1,
2010+
RefreshOptions {
2011+
search_kind: Some(PythonEnvironmentKind::Venv),
2012+
search_paths: None,
2013+
},
2014+
);
2015+
2016+
assert!(matches!(
2017+
coordinator.register_request(1, key.clone()),
2018+
RefreshRegistration::Start
2019+
));
2020+
2021+
let (state_tx, state_rx) = mpsc::channel();
2022+
let waiter = {
2023+
let coordinator = coordinator.clone();
2024+
let other_key = other_key.clone();
2025+
thread::spawn(move || {
2026+
// Different key → returns Wait (not Joined).
2027+
assert!(matches!(
2028+
coordinator.register_request(2, other_key.clone()),
2029+
RefreshRegistration::Wait
2030+
));
2031+
state_tx.send("waiting").unwrap();
2032+
coordinator.wait_until_idle();
2033+
state_tx.send("idle").unwrap();
2034+
})
2035+
};
2036+
2037+
assert_eq!(state_rx.recv().unwrap(), "waiting");
2038+
2039+
// Create and immediately drop the safety guard without disarming it.
2040+
// This simulates the thread dying before begin_completion.
2041+
{
2042+
let _guard = RefreshSafetyGuard::new(&coordinator, key.clone());
2043+
}
2044+
2045+
// Waiter should be unblocked.
2046+
assert_eq!(state_rx.recv().unwrap(), "idle");
2047+
waiter.join().unwrap();
2048+
}
2049+
2050+
/// Test for #396: RefreshSafetyGuard does NOT interfere when disarmed
2051+
/// (normal path where RefreshCompletionGuard takes over).
2052+
#[test]
2053+
fn test_safety_guard_disarmed_does_not_interfere() {
2054+
let coordinator = RefreshCoordinator::default();
2055+
let key = make_refresh_key(1, RefreshOptions::default());
2056+
2057+
assert!(matches!(
2058+
coordinator.register_request(1, key.clone()),
2059+
RefreshRegistration::Start
2060+
));
2061+
2062+
{
2063+
let mut safety_guard = RefreshSafetyGuard::new(&coordinator, key.clone());
2064+
let mut completion_guard = RefreshCompletionGuard::begin(&coordinator, &key);
2065+
safety_guard.disarm();
2066+
let ids = completion_guard.drain_request_ids();
2067+
assert_eq!(ids, vec![1]);
2068+
assert!(completion_guard.finish_if_no_pending());
2069+
}
2070+
2071+
// Should be Idle — can start a new refresh.
2072+
assert!(matches!(
2073+
coordinator.register_request(2, key.clone()),
2074+
RefreshRegistration::Start
2075+
));
2076+
}
2077+
2078+
/// Test for #395: configure resets MISSING_ENVS_REPORTING_STATE so that
2079+
/// subsequent refreshes can trigger missing-env reporting again.
2080+
#[test]
2081+
fn test_configure_resets_completed_missing_env_reporting() {
2082+
let _guard = MISSING_ENVS_TEST_LOCK.lock().unwrap();
2083+
2084+
let configuration = Arc::new(RwLock::new(ConfigurationState {
2085+
generation: 1,
2086+
config: Configuration::default(),
2087+
}));
2088+
2089+
// Simulate a completed first refresh.
2090+
MISSING_ENVS_REPORTING_STATE.store(MISSING_ENVS_AVAILABLE, Ordering::Release);
2091+
assert!(try_begin_missing_env_reporting(configuration.as_ref(), 1));
2092+
complete_missing_env_reporting(1);
2093+
2094+
// Missing-env reporting is now exhausted.
2095+
assert!(!try_begin_missing_env_reporting(configuration.as_ref(), 1));
2096+
2097+
// Simulate what handle_configure does: bump generation and reset.
2098+
{
2099+
let mut state = configuration.write().unwrap();
2100+
state.generation = 2;
2101+
MISSING_ENVS_REPORTING_STATE.store(MISSING_ENVS_AVAILABLE, Ordering::Release);
2102+
}
2103+
2104+
// Missing-env reporting should work again for the new generation.
2105+
assert!(try_begin_missing_env_reporting(configuration.as_ref(), 2));
2106+
2107+
// Cleanup.
2108+
MISSING_ENVS_REPORTING_STATE.store(MISSING_ENVS_AVAILABLE, Ordering::Release);
2109+
}
19012110
}

0 commit comments

Comments
 (0)