Skip to content

Commit a019901

Browse files
shayne-fletchermeta-codesync[bot]
authored andcommitted
port v0 multi-process test to v1 (meta-pytorch#2002)
Summary: Pull Request resolved: meta-pytorch#2002 port stop-timeout / abort coverage from hyperactor_multiprocess into the v1 mesh layer. this adds a `SleepActor` helper and a new `actor_mesh::test_actor_mesh_stop_timeout` that overrides `ACTOR_SPAWN_MAX_IDLE`, spawns sleeping actors across a proc mesh, calls `stop()`, and asserts we get a timeout error within the expected window, proving hung actors are aborted via `Proc::destroy_and_wait` rather than waited out. the V0 test is left in place but re-documented to point at the new v1 test and to note the slight API difference (V0 reports abort counts, V1 surfaces the timeout as an error). Reviewed By: dulinriley Differential Revision: D87904499 fbshipit-source-id: a3c3953ee06ed5f7ca80b16fd73b4c4710572f54
1 parent 1e1f404 commit a019901

File tree

3 files changed

+120
-9
lines changed

3 files changed

+120
-9
lines changed

hyperactor_mesh/src/v1/actor_mesh.rs

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,7 @@ mod tests {
469469
use crate::v1::ActorMeshRef;
470470
use crate::v1::Name;
471471
use crate::v1::ProcMesh;
472+
use crate::v1::proc_mesh::ACTOR_SPAWN_MAX_IDLE;
472473
use crate::v1::proc_mesh::GET_ACTOR_STATE_MAX_IDLE;
473474
use crate::v1::testactor;
474475
use crate::v1::testing;
@@ -938,4 +939,97 @@ mod tests {
938939
n, count
939940
);
940941
}
942+
943+
/// Test that actors not responding within stop timeout are
944+
/// forcibly aborted. This is the V1 equivalent of
945+
/// hyperactor_multiprocess/src/proc_actor.rs::test_stop_timeout.
946+
#[async_timed_test(timeout_secs = 30)]
947+
#[cfg(fbcode_build)]
948+
async fn test_actor_mesh_stop_timeout() {
949+
hyperactor_telemetry::initialize_logging_for_test();
950+
951+
// Override ACTOR_SPAWN_MAX_IDLE to make test fast and
952+
// deterministic. ACTOR_SPAWN_MAX_IDLE is the maximum idle
953+
// time between status updates during mesh operations
954+
// (spawn/stop). When stop() is called, it waits for actors to
955+
// report they've stopped. If actors don't respond within this
956+
// timeout, they're forcibly aborted via JoinHandle::abort().
957+
// We set this to 1 second (instead of default 30s) so hung
958+
// actors (sleeping 5s in this test) get aborted quickly,
959+
// making the test fast.
960+
let config = hyperactor::config::global::lock();
961+
let _guard = config.override_key(ACTOR_SPAWN_MAX_IDLE, std::time::Duration::from_secs(1));
962+
963+
let instance = testing::instance().await;
964+
965+
// Create proc mesh with 2 replicas
966+
let meshes = testing::proc_meshes(instance, extent!(replicas = 2)).await;
967+
let proc_mesh = &meshes[1]; // Use ProcessAllocator version
968+
969+
// Spawn SleepActors across the mesh that will block longer
970+
// than timeout
971+
let sleep_mesh = proc_mesh
972+
.spawn::<testactor::SleepActor>(instance, "sleepers", &())
973+
.await
974+
.unwrap();
975+
976+
// Send each actor a message to sleep for 5 seconds (longer
977+
// than 1-second timeout)
978+
for actor_ref in sleep_mesh.values() {
979+
actor_ref
980+
.send(instance, std::time::Duration::from_secs(5))
981+
.unwrap();
982+
}
983+
984+
// Give actors time to start sleeping
985+
RealClock.sleep(std::time::Duration::from_millis(200)).await;
986+
987+
// Count how many actors we spawned (for verification later)
988+
let expected_actors = sleep_mesh.values().count();
989+
990+
// Now stop the mesh - actors won't respond in time, should be
991+
// aborted. Time this operation to verify abort behavior.
992+
let stop_start = RealClock.now();
993+
let result = sleep_mesh.stop(instance).await;
994+
let stop_duration = RealClock.now().duration_since(stop_start);
995+
996+
// Stop will return an error because actors didn't stop within
997+
// the timeout. This is expected - the actors were forcibly
998+
// aborted, and V1 reports this as an error.
999+
match result {
1000+
Ok(_) => {
1001+
// It's possible actors stopped in time, but unlikely
1002+
// given 5-second sleep vs 1-second timeout
1003+
tracing::warn!("Actors stopped gracefully (unexpected but ok)");
1004+
}
1005+
Err(ref e) => {
1006+
// Expected: timeout error indicating actors were aborted
1007+
let err_str = format!("{:?}", e);
1008+
assert!(
1009+
err_str.contains("Timeout"),
1010+
"Expected Timeout error, got: {:?}",
1011+
e
1012+
);
1013+
tracing::info!(
1014+
"Stop timed out as expected for {} actors, they were aborted",
1015+
expected_actors
1016+
);
1017+
}
1018+
}
1019+
1020+
// Verify that stop completed quickly (~1-2 seconds for
1021+
// timeout + abort) rather than waiting the full 5 seconds for
1022+
// actors to finish sleeping. This proves actors were aborted,
1023+
// not waited for.
1024+
assert!(
1025+
stop_duration < std::time::Duration::from_secs(3),
1026+
"Stop took {:?}, expected < 3s (actors should have been aborted, not waited for)",
1027+
stop_duration
1028+
);
1029+
assert!(
1030+
stop_duration >= std::time::Duration::from_millis(900),
1031+
"Stop took {:?}, expected >= 900ms (should have waited for timeout)",
1032+
stop_duration
1033+
);
1034+
}
9411035
}

hyperactor_mesh/src/v1/testactor.rs

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,7 @@ use hyperactor::Named;
2929
use hyperactor::PortRef;
3030
use hyperactor::RefClient;
3131
use hyperactor::Unbind;
32-
#[cfg(test)]
3332
use hyperactor::clock::Clock as _;
34-
#[cfg(test)]
3533
use hyperactor::clock::RealClock;
3634
use hyperactor::config;
3735
use hyperactor::config::global::Source;
@@ -159,6 +157,27 @@ impl Handler<ActorSupervisionEvent> for TestActorWithSupervisionHandling {
159157
}
160158
}
161159

160+
/// A test actor that sleeps when it receives a Duration message.
161+
/// Used for testing timeout and abort behavior.
162+
#[derive(Actor, Default, Debug)]
163+
#[hyperactor::export(
164+
spawn = true,
165+
handlers = [std::time::Duration],
166+
)]
167+
pub struct SleepActor;
168+
169+
#[async_trait]
170+
impl Handler<std::time::Duration> for SleepActor {
171+
async fn handle(
172+
&mut self,
173+
_cx: &Context<Self>,
174+
duration: std::time::Duration,
175+
) -> Result<(), anyhow::Error> {
176+
RealClock.sleep(duration).await;
177+
Ok(())
178+
}
179+
}
180+
162181
/// A message to forward to a visit list of ports.
163182
/// Each port removes the next entry, and adds it to the
164183
/// 'visited' list.

hyperactor_multiprocess/src/proc_actor.rs

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1063,16 +1063,14 @@ mod tests {
10631063
}
10641064
}
10651065

1066-
// V0 test - V1 needs equivalent coverage. Tests that actors not
1066+
// V0 test - V1 has equivalent coverage. Tests that actors not
10671067
// responding within stop timeout are forcibly aborted
10681068
// (JoinHandle::abort). Spawns SleepActors that block for 5
10691069
// seconds, calls stop() with 1-second timeout, verifies abort
1070-
// counts and "aborting JoinHandle" logs. V1 uses the same
1071-
// underlying mechanism (Proc::destroy_and_wait) but lacks test
1072-
// coverage. V1's ActorMesh::stop() uses global config timeout
1073-
// (ACTOR_SPAWN_MAX_IDLE) and doesn't expose stopped/aborted
1074-
// counts, but equivalent tests should verify timeout and abort
1075-
// behavior work correctly.
1070+
// counts and "aborting JoinHandle" logs. V1 equivalent:
1071+
// hyperactor_mesh/src/v1/actor_mesh.rs::test_actor_mesh_stop_timeout.
1072+
// Both use the same underlying mechanism (Proc::destroy_and_wait),
1073+
// but V1 returns Err(Timeout) instead of Ok with abort counts.
10761074
#[tracing_test::traced_test]
10771075
#[tokio::test]
10781076
#[cfg_attr(not(fbcode_build), ignore)]

0 commit comments

Comments
 (0)