system_actor: audit tests (meta-pytorch#1999)

shayne-fletcher · meta-codesync[bot] · commit 1e1f40469b9b · 2025-11-26T18:09:05.000-08:00
Summary: Pull Request resolved: meta-pytorch#1999. This change applies the same V0->V1 test audit to 'system_actor.rs' that we did for 'proc_actor.rs'. I went through all 7 tests in this file and tagged each one as V0-specific, with no V1 equivalent: they all exercise `SystemActor`-centric behavior that simply doesn’t exist in the V1 design (world-level supervision state, world orchestration via hosts joining and `UpsertWorld`, snapshot filtering, and `ReportingRouter`/dynamic address updates). The new comments spell out what each test is validating in V0 and why there is no direct V1 analog, and where relevant they point at the closest V1 mechanism (for example, undeliverable handling is now covered by `actor_mesh::test_undeliverable_message_return`). There are no behavior changes here; this is purely test documentation and migration context for the remaining V0 multiprocess tests in 'system_actor.rs'. Reviewed By: pzhan9 Differential Revision: D87890512 fbshipit-source-id: 389e9070de428de20e6b79b87ecfdcdced989703
diff --git a/hyperactor_multiprocess/src/system_actor.rs b/hyperactor_multiprocess/src/system_actor.rs
@@ -1890,6 +1890,14 @@ mod tests {
         }
     }
 
+    // V0-specific test - no V1 equivalent. Unit test for
+    // SystemSupervisionState which tracks proc health and failed
+    // actors centrally at world level. Tests heartbeat timeout
+    // detection (marks procs expired if no heartbeat within timeout)
+    // and failed actor aggregation. V1 does not have centralized
+    // supervision state - V1 uses local supervision where actors
+    // handle ActorSupervisionEvent locally rather than reporting to a
+    // central SystemActor for world-level health monitoring.
     #[tokio::test]
     async fn test_supervision_state() {
         let mut sv = SystemSupervisionState::new(Duration::from_secs(1));
@@ -1989,6 +1997,16 @@ mod tests {
         );
     }
 
+    // V0-specific test - no V1 equivalent. Tests SystemActor world
+    // orchestration where hosts can join before world is created.
+    // Flow: hosts send Join messages → queued by SystemActor →
+    // UpsertWorld defines world topology → SystemActor sends
+    // SpawnProc messages telling each host which procs to spawn.
+    // Verifies correct proc assignment across hosts. V1 does not have
+    // this orchestration model - V1 uses coordinated ProcMesh
+    // allocation where meshes are allocated in one operation, not
+    // assembled from hosts independently joining a central
+    // SystemActor.
     #[tracing_test::traced_test]
     #[tokio::test]
     async fn test_host_join_before_world() {
@@ -2064,6 +2082,14 @@ mod tests {
         }
     }
 
+    // V0-specific test - no V1 equivalent. Tests SystemActor world
+    // orchestration where world is created before hosts join (reverse
+    // order of test_host_join_before_world). Flow: UpsertWorld
+    // defines topology → hosts send Join messages → SystemActor
+    // immediately sends SpawnProc messages. Tests that join order
+    // doesn't matter. V1 does not have this orchestration model - V1
+    // uses coordinated ProcMesh allocation where meshes are allocated
+    // in one operation.
     #[tokio::test]
     async fn test_host_join_after_world() {
         // Spins up a new world with 2 hosts, with 3 procs each.
@@ -2138,6 +2164,12 @@ mod tests {
         }
     }
 
+    // V0-specific test - no V1 equivalent. Unit test for
+    // SystemSnapshotFilter which filters worlds by name and labels
+    // when querying SystemActor. Tests world_matches() and
+    // labels_match() logic. V1 does not have SystemActor or
+    // SystemSnapshot - V1 uses mesh-based iteration and state queries
+    // instead.
     #[test]
     fn test_snapshot_filter() {
         let test_world = World::new(
@@ -2176,6 +2208,13 @@ mod tests {
         ));
     }
 
+    // V0-specific test - no V1 equivalent. Tests SystemActor
+    // supervision behavior when mailbox server crashes: undeliverable
+    // messages are handled AND system supervision detects the
+    // unhealthy world state. V1 does not have SystemActor or world
+    // supervision. V1 undeliverable message handling (without
+    // supervision) is tested in
+    // hyperactor_mesh/src/v1/actor_mesh.rs::test_undeliverable_message_return.
     #[tokio::test]
     async fn test_undeliverable_message_return() {
         // System can't send a message to a remote actor because the
@@ -2349,6 +2388,13 @@ mod tests {
         ));
     }
 
+    // V0-specific test - no V1 equivalent. Tests SystemActor stop
+    // when system is empty (no worlds). Sends SystemMessage::Stop to
+    // central SystemActor which coordinates shutdown of all worlds.
+    // V1 does not have a central SystemActor - V1 uses mesh-level
+    // stop operations (ProcMesh::stop(), HostMesh::shutdown()) where
+    // you stop individual meshes rather than a system-wide
+    // coordinator.
     #[tokio::test]
     async fn test_stop_fast() -> Result<()> {
         let server_handle = System::serve(
@@ -2380,6 +2426,12 @@ mod tests {
         Ok(())
     }
 
+    // V0-specific test - no V1 equivalent. Tests ReportingRouter's
+    // UpdateAddress behavior in simnet mode. When messages are sent,
+    // post_update_address() sends MailboxAdminMessage::UpdateAddress
+    // to update address caches with simnet source routing info. V1
+    // does not have ReportingRouter or dynamic address updates - V1
+    // uses static/direct addressing.
     #[tokio::test]
     async fn test_update_sim_address() {
         simnet::start();