feat: enhance agentic test script with extended timeouts and detailed logging

jruokola · jruokola · commit 1c25e8203d46 · 2025-11-17T23:33:28.000+02:00
Updated test_agentic_mcp.py with improved debugging and inspection capabilities:

Timeout Changes:
- Increased all test timeouts from 60/90s to 300s (5 minutes)
- Allows agentic workflows to complete multi-step reasoning without interruption

File Logging:
- Created test_output/ directory for detailed test logs
- Each test saves its input query and output to a timestamped log file
- Log file naming: {test_number:02d}_{tool_name}_{timestamp}.log
- Logs include: test metadata, input query, full output, duration, status

Enhanced Output:
- Shows log filename after each test completes
- Summary displays test_output directory location
- Added test output directory to configuration display
- Preserves full error tracebacks and timeout records in the log files

Benefits:
- Easier debugging of agentic workflow issues
- Full responses can be inspected without scrolling through console output
- Timestamped logs for comparing runs
- Complete error context preserved for analysis

Files changed:
- test_agentic_mcp.py: Added file logging and increased timeouts
- .gitignore: Added /test_output/ to ignore test logs
diff --git a/.gitignore b/.gitignore
@@ -179,3 +179,4 @@ OUROBOROS.md
 SESSION-MEMORY.md
 .serena/
 .codegraph.toml
+/test_output/
diff --git a/test_agentic_mcp.py b/test_agentic_mcp.py
@@ -83,33 +83,37 @@
 AGENTIC_TESTS = [
     ("agentic_code_search",
      "How is configuration loaded in this codebase? Find all config loading mechanisms.",
-     60),
+     300),
 
     ("agentic_dependency_analysis",
      "Analyze the dependency chain for the AgenticOrchestrator. What does it depend on?",
-     60),
+     300),
 
     ("agentic_call_chain_analysis",
      "Trace the call chain from execute_agentic_workflow to the graph analysis tools",
-     60),
+     300),
 
     ("agentic_architecture_analysis",
      "Analyze the architecture of the MCP server. Find coupling metrics and hub nodes.",
-     90),
+     300),
 
     ("agentic_api_surface_analysis",
      "What is the public API surface of the GraphToolExecutor?",
-     60),
+     300),
 
     ("agentic_context_builder",
      "Gather comprehensive context about the tier-aware prompt selection system",
-     90),
+     300),
 
     ("agentic_semantic_question",
      "How does the LRU cache work in GraphToolExecutor? What gets cached and when?",
-     60),
+     300),
 ]
 
+# Create output directory for test logs
+TEST_OUTPUT_DIR = Path(__file__).resolve().parent / "test_output"
+TEST_OUTPUT_DIR.mkdir(exist_ok=True)
+
 
 def print_config():
     """Print resolved configuration."""
@@ -146,6 +150,8 @@ def print_config():
     db = os.environ.get("CODEGRAPH_SURREALDB_DATABASE", "codegraph")
     print(f"\n  SurrealDB: {url}")
     print(f"  Namespace/DB: {ns}/{db}")
+
+    print(f"\n  📁 Test Output Directory: {TEST_OUTPUT_DIR}")
     print("=" * 72)
 
 
@@ -222,8 +228,24 @@ async def run_stdio_tests():
                     print(f"  Query: {query[:60]}...")
                     print(f"  Timeout: {timeout}s")
 
+                    # Create log file for this test
+                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+                    log_file = TEST_OUTPUT_DIR / f"{idx:02d}_{tool_name}_{timestamp}.log"
+
                     start_time = asyncio.get_event_loop().time()
 
+                    # Write input to log file
+                    with open(log_file, "w", encoding="utf-8") as f:
+                        f.write("=" * 80 + "\n")
+                        f.write(f"Test: {tool_name}\n")
+                        f.write(f"Timestamp: {timestamp}\n")
+                        f.write(f"Timeout: {timeout}s\n")
+                        f.write("=" * 80 + "\n\n")
+                        f.write("INPUT QUERY:\n")
+                        f.write("-" * 80 + "\n")
+                        f.write(query + "\n")
+                        f.write("-" * 80 + "\n\n")
+
                     try:
                         # Call tool with timeout
                         result = await asyncio.wait_for(
@@ -236,12 +258,23 @@ async def run_stdio_tests():
                         # Parse result
                         if result.content and len(result.content) > 0:
                             text_content = result.content[0].text
+
+                            # Write output to log file
+                            with open(log_file, "a", encoding="utf-8") as f:
+                                f.write("OUTPUT:\n")
+                                f.write("-" * 80 + "\n")
+                                f.write(text_content + "\n")
+                                f.write("-" * 80 + "\n\n")
+                                f.write(f"Duration: {duration:.1f}s\n")
+                                f.write(f"Status: SUCCESS\n")
+
                             try:
                                 data = json.loads(text_content)
                                 steps = data.get("total_steps", 0)
                                 final_answer = data.get("final_answer", "")
 
                                 print(f"  ✅ SUCCESS in {duration:.1f}s ({steps} steps)")
+                                print(f"  📝 Log saved: {log_file.name}")
                                 if final_answer:
                                     preview = final_answer[:100].replace('\n', ' ')
                                     print(f"     {preview}...")
@@ -254,14 +287,24 @@ async def run_stdio_tests():
                                 })
                             except json.JSONDecodeError:
                                 print(f"  ✅ SUCCESS in {duration:.1f}s (non-JSON response)")
+                                print(f"  📝 Log saved: {log_file.name}")
                                 results.append({
                                     "test": tool_name,
                                     "success": True,
                                     "duration": duration,
                                     "steps": 0
                                 })
                         else:
+                            with open(log_file, "a", encoding="utf-8") as f:
+                                f.write("OUTPUT:\n")
+                                f.write("-" * 80 + "\n")
+                                f.write("(Empty result)\n")
+                                f.write("-" * 80 + "\n\n")
+                                f.write(f"Duration: {duration:.1f}s\n")
+                                f.write(f"Status: FAILED (empty result)\n")
+
                             print(f"  ❌ FAILED: Empty result")
+                            print(f"  📝 Log saved: {log_file.name}")
                             results.append({
                                 "test": tool_name,
                                 "success": False,
@@ -271,7 +314,16 @@ async def run_stdio_tests():
 
                     except asyncio.TimeoutError:
                         duration = timeout
+                        with open(log_file, "a", encoding="utf-8") as f:
+                            f.write("OUTPUT:\n")
+                            f.write("-" * 80 + "\n")
+                            f.write(f"TIMEOUT after {timeout}s\n")
+                            f.write("-" * 80 + "\n\n")
+                            f.write(f"Duration: {duration:.1f}s\n")
+                            f.write(f"Status: TIMEOUT\n")
+
                         print(f"  ❌ TIMEOUT after {timeout}s")
+                        print(f"  📝 Log saved: {log_file.name}")
                         results.append({
                             "test": tool_name,
                             "success": False,
@@ -281,9 +333,22 @@ async def run_stdio_tests():
 
                     except Exception as e:
                         duration = asyncio.get_event_loop().time() - start_time
+                        import traceback
+                        error_trace = traceback.format_exc()
+
+                        with open(log_file, "a", encoding="utf-8") as f:
+                            f.write("OUTPUT:\n")
+                            f.write("-" * 80 + "\n")
+                            f.write(f"ERROR: {e}\n\n")
+                            f.write("Full traceback:\n")
+                            f.write(error_trace)
+                            f.write("-" * 80 + "\n\n")
+                            f.write(f"Duration: {duration:.1f}s\n")
+                            f.write(f"Status: ERROR\n")
+
                         print(f"  ❌ ERROR: {e}")
+                        print(f"  📝 Log saved: {log_file.name}")
                         print(f"\n  📋 Full error details:")
-                        import traceback
                         traceback.print_exc()
                         results.append({
                             "test": tool_name,
@@ -312,6 +377,7 @@ async def run_stdio_tests():
         print()
 
     print(f"\nTotal: {passed}/{total} passed")
+    print(f"\n📁 Detailed logs saved to: {TEST_OUTPUT_DIR}")
     print("=" * 72)
 
     return 0 if passed == total else 1
@@ -356,8 +422,24 @@ async def run_http_tests():
                     print(f"  Query: {query[:60]}...")
                     print(f"  Timeout: {timeout}s")
 
+                    # Create log file for this test
+                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+                    log_file = TEST_OUTPUT_DIR / f"{idx:02d}_{tool_name}_{timestamp}.log"
+
                     start_time = asyncio.get_event_loop().time()
 
+                    # Write input to log file
+                    with open(log_file, "w", encoding="utf-8") as f:
+                        f.write("=" * 80 + "\n")
+                        f.write(f"Test: {tool_name}\n")
+                        f.write(f"Timestamp: {timestamp}\n")
+                        f.write(f"Timeout: {timeout}s\n")
+                        f.write("=" * 80 + "\n\n")
+                        f.write("INPUT QUERY:\n")
+                        f.write("-" * 80 + "\n")
+                        f.write(query + "\n")
+                        f.write("-" * 80 + "\n\n")
+
                     try:
                         # Call tool with timeout
                         result = await asyncio.wait_for(
@@ -370,12 +452,23 @@ async def run_http_tests():
                         # Parse result
                         if result.content and len(result.content) > 0:
                             text_content = result.content[0].text
+
+                            # Write output to log file
+                            with open(log_file, "a", encoding="utf-8") as f:
+                                f.write("OUTPUT:\n")
+                                f.write("-" * 80 + "\n")
+                                f.write(text_content + "\n")
+                                f.write("-" * 80 + "\n\n")
+                                f.write(f"Duration: {duration:.1f}s\n")
+                                f.write(f"Status: SUCCESS\n")
+
                             try:
                                 data = json.loads(text_content)
                                 steps = data.get("total_steps", 0)
                                 final_answer = data.get("final_answer", "")
 
                                 print(f"  ✅ SUCCESS in {duration:.1f}s ({steps} steps)")
+                                print(f"  📝 Log saved: {log_file.name}")
                                 if final_answer:
                                     preview = final_answer[:100].replace('\n', ' ')
                                     print(f"     {preview}...")
@@ -388,14 +481,24 @@ async def run_http_tests():
                                 })
                             except json.JSONDecodeError:
                                 print(f"  ✅ SUCCESS in {duration:.1f}s (non-JSON response)")
+                                print(f"  📝 Log saved: {log_file.name}")
                                 results.append({
                                     "test": tool_name,
                                     "success": True,
                                     "duration": duration,
                                     "steps": 0
                                 })
                         else:
+                            with open(log_file, "a", encoding="utf-8") as f:
+                                f.write("OUTPUT:\n")
+                                f.write("-" * 80 + "\n")
+                                f.write("(Empty result)\n")
+                                f.write("-" * 80 + "\n\n")
+                                f.write(f"Duration: {duration:.1f}s\n")
+                                f.write(f"Status: FAILED (empty result)\n")
+
                             print(f"  ❌ FAILED: Empty result")
+                            print(f"  📝 Log saved: {log_file.name}")
                             results.append({
                                 "test": tool_name,
                                 "success": False,
@@ -405,7 +508,16 @@ async def run_http_tests():
 
                     except asyncio.TimeoutError:
                         duration = timeout
+                        with open(log_file, "a", encoding="utf-8") as f:
+                            f.write("OUTPUT:\n")
+                            f.write("-" * 80 + "\n")
+                            f.write(f"TIMEOUT after {timeout}s\n")
+                            f.write("-" * 80 + "\n\n")
+                            f.write(f"Duration: {duration:.1f}s\n")
+                            f.write(f"Status: TIMEOUT\n")
+
                         print(f"  ❌ TIMEOUT after {timeout}s")
+                        print(f"  📝 Log saved: {log_file.name}")
                         results.append({
                             "test": tool_name,
                             "success": False,
@@ -415,9 +527,22 @@ async def run_http_tests():
 
                     except Exception as e:
                         duration = asyncio.get_event_loop().time() - start_time
+                        import traceback
+                        error_trace = traceback.format_exc()
+
+                        with open(log_file, "a", encoding="utf-8") as f:
+                            f.write("OUTPUT:\n")
+                            f.write("-" * 80 + "\n")
+                            f.write(f"ERROR: {e}\n\n")
+                            f.write("Full traceback:\n")
+                            f.write(error_trace)
+                            f.write("-" * 80 + "\n\n")
+                            f.write(f"Duration: {duration:.1f}s\n")
+                            f.write(f"Status: ERROR\n")
+
                         print(f"  ❌ ERROR: {e}")
+                        print(f"  📝 Log saved: {log_file.name}")
                         print(f"\n  📋 Full error details:")
-                        import traceback
                         traceback.print_exc()
                         results.append({
                             "test": tool_name,
@@ -456,6 +581,7 @@ async def run_http_tests():
         print()
 
     print(f"\nTotal: {passed}/{total} passed")
+    print(f"\n📁 Detailed logs saved to: {TEST_OUTPUT_DIR}")
     print("=" * 72)
 
     return 0 if passed == total else 1