From 1b200cf47338d5c56c8018d2f8c8f62c1d03a293 Mon Sep 17 00:00:00 2001 From: Avikalp Kumar Gupta Date: Fri, 24 Oct 2025 22:54:04 -0700 Subject: [PATCH 1/4] feat: Add graph data export feature (v1.1.0) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ability to export complete networkx graph data structure to file formats that other programs can consume programmatically. New Features: - Add --format option to choose between 'html' (default) and 'graph' output - Add --graph-format option to select serialization (json/pickle/graphml) - Support for JSON export (human-readable, widely compatible) - Support for Pickle export (Python-native, exact data preservation) - Support for GraphML export (standard graph format for analysis tools) New Files: - diffgraph/graph_export.py: Core export/import functionality - test_graph_export.py: Comprehensive test suite - example_usage.py: Example script for using exported data - test_cli_manual.sh: Automated testing script - GRAPH_EXPORT_FEATURE.md: Feature documentation - TESTING_GUIDE.md: Testing instructions Modified Files: - diffgraph/cli.py: Added new CLI options and conditional output logic - diffgraph/graph_manager.py: Added export_to_dict() method - README.md: Updated documentation with new features - CHANGELOG.md: Added v1.1.0 release notes - setup.py: Bumped version to 1.1.0 - diffgraph/__init__.py: Updated version to 1.1.0 Technical Details: - Exported data includes file nodes, component nodes, graphs, and metadata - All formats support round-trip (export โ†’ load) with data integrity - NetworkX graphs serialized using node-link format for compatibility - Backward compatible: existing HTML functionality unchanged Testing: - All unit tests pass (test_graph_export.py) - Automated test suite validates all formats - Example usage script demonstrates practical use cases --- CHANGELOG.md | 28 ++++ GRAPH_EXPORT_FEATURE.md | 187 +++++++++++++++++++++++ README.md | 61 +++++++- 
TESTING_GUIDE.md | 302 +++++++++++++++++++++++++++++++++++++ diffgraph/__init__.py | 2 +- diffgraph/cli.py | 62 +++++--- diffgraph/graph_export.py | 273 +++++++++++++++++++++++++++++++++ diffgraph/graph_manager.py | 59 +++++++- example_usage.py | 88 +++++++++++ setup.py | 2 +- test_cli_manual.sh | 145 ++++++++++++++++++ test_graph_export.py | 115 ++++++++++++++ 12 files changed, 1296 insertions(+), 28 deletions(-) create mode 100644 GRAPH_EXPORT_FEATURE.md create mode 100644 TESTING_GUIDE.md create mode 100644 diffgraph/graph_export.py create mode 100644 example_usage.py create mode 100755 test_cli_manual.sh create mode 100644 test_graph_export.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 491e233..368db30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,34 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [1.1.0] - 2025-10-24 + +### Added +- **Graph Data Export Feature**: Export complete networkx graph data structure to file + - New `--format` option to choose between HTML and graph output formats + - New `--graph-format` option to select serialization format (json, pickle, graphml) + - Support for JSON export (human-readable, widely compatible) + - Support for Pickle export (Python-native, preserves exact data structures) + - Support for GraphML export (standard graph format for analysis tools) + - New `diffgraph/graph_export.py` module with export/import functions + - `export_to_dict()` method added to GraphManager for serialization + - `load_graph_from_json()` and `load_graph_from_pickle()` functions for loading exported data +- Comprehensive test suite (`test_graph_export.py`) for graph export functionality +- Example usage script (`example_usage.py`) demonstrating how to use exported data +- Automated test script (`test_cli_manual.sh`) for easy feature validation +- Documentation: `GRAPH_EXPORT_FEATURE.md` and `TESTING_GUIDE.md` + +### Changed +- Updated `--output` option description to reflect format-aware default paths +- Enhanced README with graph export documentation and usage examples +- Updated feature list to include graph data export capabilities + +### Technical Details +- Exported data includes file nodes, component nodes, dependency graphs, and metadata +- All graph data can be loaded back into GraphManager for programmatic analysis +- NetworkX graphs are serialized using node-link format for compatibility +- Backward compatible: existing HTML output functionality unchanged + ## [1.0.0] - 2025-08-06 ### Changed diff --git a/GRAPH_EXPORT_FEATURE.md b/GRAPH_EXPORT_FEATURE.md new file mode 100644 index 0000000..f75ec25 --- /dev/null +++ b/GRAPH_EXPORT_FEATURE.md @@ -0,0 +1,187 @@ +# Graph Export Feature + +## Overview + +The DiffGraph CLI now supports exporting the complete networkx graph data structure directly to a file, allowing other programs to 
programmatically access and analyze the code change data. + +## What's New + +### CLI Options + +- `--format` / `-f`: Choose output format (`html` or `graph`) +- `--graph-format`: Choose serialization format for graph export (`json`, `pickle`, or `graphml`) +- `--output` / `-o`: Output file path (auto-detects extension based on format) + +### Usage Examples + +```bash +# Export as JSON (default for graph format) +wild diff --format graph --output analysis.json + +# Export as pickle +wild diff --format graph --graph-format pickle --output analysis.pkl + +# Export as GraphML +wild diff --format graph --graph-format graphml --output analysis.graphml + +# HTML output still works as before (default) +wild diff --output report.html +``` + +## Exported Data Structure + +The exported graph data includes: + +1. **File Nodes**: All analyzed files with their metadata + - Path + - Status (pending/processing/processed/error) + - Change type (added/deleted/modified/unchanged) + - Summary + - Components list + +2. **Component Nodes**: All code components (classes, functions, methods) + - Name + - File path + - Change type + - Component type (container/function/method) + - Parent component (for nested components) + - Summary + - Dependencies + - Dependents + +3. **Graph Structures**: NetworkX directed graphs + - File dependency graph + - Component dependency graph + +4. 
**Metadata** + - Version information + - Processing status + - List of processed files + +## JSON Format Example + +```json +{ + "version": "1.0", + "file_nodes": { + "app/main.py": { + "path": "app/main.py", + "status": "processed", + "change_type": "modified", + "summary": "Modified main application file", + "error": null, + "components": [] + } + }, + "component_nodes": { + "app/main.py::MyClass": { + "name": "MyClass", + "file_path": "app/main.py", + "change_type": "modified", + "component_type": "container", + "parent": null, + "summary": "Main application class", + "dependencies": [], + "dependents": [] + } + }, + "file_graph": { ... }, + "component_graph": { ... }, + "processed_files": ["app/main.py"] +} +``` + +## Using Exported Data + +### Loading Graph Data + +```python +from diffgraph.graph_export import load_graph_from_json + +# Load exported data +graph_manager = load_graph_from_json('analysis.json') + +# Access file information +for file_path, file_node in graph_manager.file_nodes.items(): + print(f"{file_path}: {file_node.change_type.value}") + +# Access component information +for component_id, component in graph_manager.component_nodes.items(): + print(f"{component.name}: {len(component.dependencies)} dependencies") +``` + +### Analyzing with NetworkX + +```python +import networkx as nx + +# Get the component dependency graph +graph = graph_manager.component_graph + +# Find most connected components +degree_centrality = nx.degree_centrality(graph) +most_connected = max(degree_centrality.items(), key=lambda x: x[1]) + +# Find cycles +try: + cycles = nx.find_cycle(graph) + print(f"Found circular dependencies: {cycles}") +except nx.NetworkXNoCycle: + print("No circular dependencies found") +``` + +## Implementation Details + +### New Files + +- `diffgraph/graph_export.py`: Core export/import functionality + - `export_graph()`: Main export function + - `export_graph_to_json()`: JSON serialization + - `export_graph_to_pickle()`: Pickle serialization + - 
`export_graph_to_graphml()`: GraphML serialization + - `load_graph_from_json()`: Load from JSON + - `load_graph_from_pickle()`: Load from pickle + +### Modified Files + +- `diffgraph/cli.py`: Added new CLI options and conditional output logic +- `diffgraph/graph_manager.py`: Added `export_to_dict()` method +- `README.md`: Updated documentation with new features + +### Test Files + +- `test_graph_export.py`: Comprehensive tests for export/import functionality +- `example_usage.py`: Example script showing how to use exported data + +## Benefits + +1. **Programmatic Access**: Other tools can now consume DiffGraph analysis results +2. **Data Persistence**: Save analysis for later review or comparison +3. **Integration**: Easy integration with CI/CD pipelines and automated workflows +4. **Flexibility**: Multiple format options for different use cases +5. **Compatibility**: Standard formats (JSON, GraphML) work with various tools + +## Testing + +Run the test suite: + +```bash +python test_graph_export.py +``` + +Try the example: + +```bash +# Export some changes +wild diff --format graph --output my-changes.json + +# Analyze the exported data +python example_usage.py my-changes.json +``` + +## Backward Compatibility + +All existing functionality is preserved. 
The default behavior remains unchanged: +- Default output format is still HTML +- Existing CLI options work as before +- No breaking changes to the API diff --git a/README.md b/README.md index ee66c7b..60dc238 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ DiffGraph-CLI is a powerful command-line tool that visualizes code changes using - ๐Ÿ“Š Visualizes code changes as a dependency graph - ๐Ÿค– AI-powered analysis of code changes +- ๐Ÿ’พ Export graph data in multiple formats (JSON, Pickle, GraphML) - ๐ŸŒ™ Dark mode support - ๐Ÿ“ Markdown-formatted summaries - ๐Ÿ” Syntax highlighting for code blocks @@ -56,17 +57,30 @@ This will: ### Command-line Options - `--api-key`: Specify your OpenAI API key (defaults to OPENAI_API_KEY environment variable) -- `--output` or `-o`: Specify the output HTML file path (default: diffgraph.html) +- `--output` or `-o`: Specify the output file path (default: diffgraph.html for HTML, diffgraph.json for graph) +- `--format` or `-f`: Output format: `html` (default) or `graph` +- `--graph-format`: Graph serialization format when using `--format graph`: `json` (default), `pickle`, or `graphml` - `--no-open`: Don't automatically open the HTML report in browser - `--version`: Show version information -Example: +Examples: ```bash +# Generate HTML report (default) wild --output my-report.html --no-open + +# Export graph data as JSON +wild --format graph --output graph-data.json + +# Export graph data as pickle +wild --format graph --graph-format pickle --output graph-data.pkl + +# Export graph data as GraphML +wild --format graph --graph-format graphml --output graph-data.graphml ``` -## ๐Ÿ“Š Example Output +## ๐Ÿ“Š Output Formats +### HTML Report (default) The generated HTML report includes: - A summary of code changes - A Mermaid.js dependency graph @@ -74,6 +88,47 @@ The generated HTML report includes: - Dark mode support - Responsive design for all screen sizes +### Graph Data Export +When using `--format graph`, the tool exports 
the complete networkx graph data structure, allowing other programs to programmatically analyze the code changes: + +**Supported formats:** +- **JSON** (default): Human-readable, widely compatible format +- **Pickle**: Python-specific format that preserves exact data structures +- **GraphML**: Standard graph format compatible with many graph analysis tools + +**Exported data includes:** +- File-level dependency graph with metadata (status, change type, summary) +- Component-level dependency graph (functions, classes, methods) +- Complete analysis results for each file and component +- Relationships between components (dependencies and dependents) + +**Example: Loading and using exported graph data** +```python +from diffgraph.graph_export import load_graph_from_json +import networkx as nx + +# Load the exported graph data +graph_manager = load_graph_from_json('diffgraph.json') + +# Access file nodes +for file_path, file_node in graph_manager.file_nodes.items(): + print(f"File: {file_path}") + print(f" Status: {file_node.status.value}") + print(f" Change Type: {file_node.change_type.value}") + print(f" Summary: {file_node.summary}") + +# Access component nodes +for component_id, component_node in graph_manager.component_nodes.items(): + print(f"Component: {component_node.name}") + print(f" Type: {component_node.component_type}") + print(f" Dependencies: {component_node.dependencies}") + +# Use networkx to analyze the graphs +print(f"Total files: {graph_manager.file_graph.number_of_nodes()}") +print(f"Total components: {graph_manager.component_graph.number_of_nodes()}") +print(f"Component dependencies: {graph_manager.component_graph.number_of_edges()}") +``` + ## ๐Ÿค Contributing Contributions are welcome! Please feel free to submit a Pull Request. 
diff --git a/TESTING_GUIDE.md b/TESTING_GUIDE.md new file mode 100644 index 0000000..3866c7b --- /dev/null +++ b/TESTING_GUIDE.md @@ -0,0 +1,302 @@ +# Testing Guide for Graph Export Feature + +## Quick Start + +### Option 1: Run the Automated Test Script (Recommended) + +The easiest way to test all functionality: + +```bash +./test_cli_manual.sh +``` + +This will: +- โœ… Test graph export in all formats (JSON, Pickle, GraphML) +- โœ… Verify data integrity +- โœ… Test loading exported data +- โœ… Verify CLI options +- โœ… No API key required! + +### Option 2: Unit Tests + +Run the comprehensive unit tests: + +```bash +python test_graph_export.py +``` + +Expected output: `โœ… All tests passed!` + +### Option 3: Manual Testing with Real Changes + +If you have an OpenAI API key and want to test with actual code changes: + +```bash +# Set up your API key +export OPENAI_API_KEY="your-key-here" + +# Option A: Export graph data (new feature) +python -m diffgraph.cli diff --format graph --output analysis.json + +# Option B: Generate HTML (existing feature - should still work) +python -m diffgraph.cli diff --output report.html --no-open + +# Analyze the exported graph data +python example_usage.py analysis.json +``` + +## Testing Scenarios + +### Scenario 1: Test Without API Key + +Create sample data and test export/import: + +```bash +python << 'EOF' +from diffgraph.graph_manager import GraphManager, ChangeType +from diffgraph.graph_export import export_graph, load_graph_from_json + +# Create test data +gm = GraphManager() +gm.add_file('example.py', ChangeType.MODIFIED) +gm.add_component('MyClass', 'example.py', ChangeType.MODIFIED, 'container') +gm.mark_processed('example.py', 'Test file', []) + +# Export +export_graph(gm, 'test.json', format='json') +print("โœ… Exported to test.json") + +# Load back +loaded = load_graph_from_json('test.json') +print(f"โœ… Loaded {len(loaded.file_nodes)} files, {len(loaded.component_nodes)} components") +EOF +``` + +### Scenario 2: Test 
All Export Formats + +```bash +python << 'EOF' +from diffgraph.graph_manager import GraphManager, ChangeType +from diffgraph.graph_export import export_graph + +gm = GraphManager() +gm.add_file('test.py', ChangeType.ADDED) +gm.mark_processed('test.py', 'New file', []) + +# Export in all formats +export_graph(gm, 'output.json', format='json') +export_graph(gm, 'output.pkl', format='pickle') +export_graph(gm, 'output.graphml', format='graphml') + +print("โœ… Generated: output.json, output.pkl, output.graphml") +EOF + +# Check the files +ls -lh output.* +``` + +### Scenario 3: Test CLI Options + +```bash +# View help to see new options +python -m diffgraph.cli --help + +# Test format option +python -m diffgraph.cli diff --format graph --help + +# Test graph-format option +python -m diffgraph.cli diff --format graph --graph-format json --help +``` + +### Scenario 4: Test with Current Git Changes + +If you have uncommitted changes in your repo: + +```bash +# Export current changes as graph data +python -m diffgraph.cli diff --format graph --output my-changes.json + +# View the exported data +cat my-changes.json | python -m json.tool | less + +# Analyze it +python example_usage.py my-changes.json +``` + +### Scenario 5: Test Data Integrity + +```bash +python << 'EOF' +from diffgraph.graph_manager import GraphManager, ChangeType +from diffgraph.graph_export import export_graph, load_graph_from_json +import json + +# Create and export +gm = GraphManager() +gm.add_file('test.py', ChangeType.MODIFIED) +gm.add_component('TestFunc', 'test.py', ChangeType.ADDED, 'function', + summary='A test function', dependencies=['other_func']) +gm.mark_processed('test.py', 'Modified test file', []) + +export_graph(gm, 'integrity_test.json', format='json') + +# Load and verify +loaded = load_graph_from_json('integrity_test.json') +comp = loaded.component_nodes['test.py::TestFunc'] + +assert comp.name == 'TestFunc' +assert comp.summary == 'A test function' +assert comp.dependencies == 
['other_func'] +assert loaded.file_nodes['test.py'].summary == 'Modified test file' + +print("โœ… Data integrity verified!") +print(f" Component: {comp.name}") +print(f" Summary: {comp.summary}") +print(f" Dependencies: {comp.dependencies}") +EOF +``` + +## Verifying the Feature Works + +### Checklist + +- [ ] CLI accepts `--format` option +- [ ] CLI accepts `--graph-format` option +- [ ] JSON export works +- [ ] Pickle export works +- [ ] GraphML export works +- [ ] Loading from JSON works +- [ ] Loading from pickle works +- [ ] Data integrity is preserved +- [ ] Example script works +- [ ] HTML output still works (backward compatibility) +- [ ] Help text shows new options + +### Quick Verification Commands + +```bash +# 1. Check CLI has new options +python -m diffgraph.cli --help | grep -E "format" + +# 2. Run unit tests +python test_graph_export.py + +# 3. Run full test suite +./test_cli_manual.sh + +# 4. Test example usage +python example_usage.py test_output.json +``` + +## Troubleshooting + +### Import Error: No module named 'networkx' + +```bash +pip install -r requirements.txt +``` + +### Import Error: No module named 'diffgraph' + +```bash +# Install in development mode +pip install -e . +``` + +### API Key Error + +For testing without an API key, use the unit tests or create sample data manually: + +```bash +# These don't require API keys: +python test_graph_export.py +./test_cli_manual.sh +``` + +For testing with real changes, you need an OpenAI API key: + +```bash +export OPENAI_API_KEY="your-key-here" +# or add to .env file +``` + +## Expected Results + +### JSON Output Structure + +```json +{ + "version": "1.0", + "file_nodes": { + "file.py": { + "path": "file.py", + "status": "processed", + "change_type": "modified", + "summary": "...", + "error": null, + "components": [...] 
+ } + }, + "component_nodes": { + "file.py::ComponentName": { + "name": "ComponentName", + "file_path": "file.py", + "change_type": "modified", + "component_type": "container", + "parent": null, + "summary": "...", + "dependencies": [...], + "dependents": [...] + } + }, + "file_graph": {...}, + "component_graph": {...}, + "processed_files": [...] +} +``` + +### File Sizes + +Typical sizes for test data: +- JSON: ~2KB (human-readable) +- Pickle: ~800B (binary, compact) +- GraphML: ~3KB (XML format) + +## Performance Testing + +To test with larger datasets: + +```bash +python << 'EOF' +from diffgraph.graph_manager import GraphManager, ChangeType +from diffgraph.graph_export import export_graph +import time + +# Create larger dataset +gm = GraphManager() +for i in range(100): + gm.add_file(f'file_{i}.py', ChangeType.MODIFIED) + gm.add_component(f'Class_{i}', f'file_{i}.py', ChangeType.MODIFIED, 'container') + gm.mark_processed(f'file_{i}.py', f'File {i}', []) + +# Time the export +start = time.time() +export_graph(gm, 'large_test.json', format='json') +elapsed = time.time() - start + +print(f"โœ… Exported 100 files in {elapsed:.2f} seconds") +EOF +``` + +## Next Steps After Testing + +1. โœ… All tests pass โ†’ Feature is ready to use +2. โš ๏ธ Some tests fail โ†’ Check error messages and file issues +3. ๐ŸŽ‰ Tests successful โ†’ Try with your real project changes! 
+ +```bash +# Use it in your project +cd /path/to/your/project +wild diff --format graph --output analysis.json +python /path/to/example_usage.py analysis.json +``` diff --git a/diffgraph/__init__.py b/diffgraph/__init__.py index 317dee1..eb877fc 100644 --- a/diffgraph/__init__.py +++ b/diffgraph/__init__.py @@ -2,4 +2,4 @@ DiffGraph - A CLI tool for visualizing code changes with AI """ -__version__ = "0.1.0" \ No newline at end of file +__version__ = "1.1.0" \ No newline at end of file diff --git a/diffgraph/cli.py b/diffgraph/cli.py index 15e0522..18bc00e 100644 --- a/diffgraph/cli.py +++ b/diffgraph/cli.py @@ -7,6 +7,7 @@ import os from diffgraph.ai_analysis import CodeAnalysisAgent from diffgraph.html_report import generate_html_report, AnalysisResult +from diffgraph.graph_export import export_graph from diffgraph.env_loader import load_env_file, debug_environment from diffgraph.utils import sanitize_diff_args, involves_working_tree @@ -133,11 +134,21 @@ def load_file_contents(changed_files: List[Dict[str, str]], diff_args: List[str] @click.version_option(package_name='wild') @click.argument('args', nargs=-1, type=click.UNPROCESSED) @click.option('--api-key', envvar='OPENAI_API_KEY', help='OpenAI API key') -@click.option('--output', '-o', default='diffgraph.html', help='Output HTML file path') +@click.option('--output', '-o', help='Output file path (default: diffgraph.html for HTML, diffgraph.json for graph)') +@click.option('--format', '-f', type=click.Choice(['html', 'graph'], case_sensitive=False), default='html', help='Output format: html or graph (default: html)') +@click.option('--graph-format', type=click.Choice(['json', 'pickle', 'graphml'], case_sensitive=False), default='json', help='Graph serialization format when using --format graph (default: json)') @click.option('--no-open', is_flag=True, help='Do not open the HTML report automatically') @click.option('--debug-env', is_flag=True, help='Debug environment variable loading') -def main(args, api_key: 
str, output: str, no_open: bool, debug_env: bool): +def main(args, api_key: str, output: str, format: str, graph_format: str, no_open: bool, debug_env: bool): """wild - Git wrapper CLI with DiffGraph for diff commands.""" + + # Set default output path based on format if not specified + if not output: + if format == 'graph': + extension_map = {'json': '.json', 'pickle': '.pkl', 'graphml': '.graphml'} + output = f'diffgraph{extension_map.get(graph_format, ".json")}' + else: + output = 'diffgraph.html' # Check if this is a diff command if args and args[0] == 'diff': @@ -195,27 +206,34 @@ def progress_callback(current_file, total_files, status): click.echo("๐Ÿง  Starting code analysis...") analysis = agent.analyze_changes(files_with_content, progress_callback) - # Create analysis result - click.echo("๐Ÿ“Š Creating analysis result...") - analysis_result = AnalysisResult( - summary=analysis.summary, - mermaid_diagram=analysis.mermaid_diagram - ) + # Generate output based on format + if format == 'graph': + # Export graph data + click.echo(f"๐Ÿ’พ Exporting graph data in {graph_format} format...") + graph_path = export_graph(agent.graph_manager, output, graph_format) + click.echo(f"โœ… Graph data exported: {graph_path}") + else: + # Create analysis result + click.echo("๐Ÿ“Š Creating analysis result...") + analysis_result = AnalysisResult( + summary=analysis.summary, + mermaid_diagram=analysis.mermaid_diagram + ) - # Generate HTML report - click.echo("๐Ÿ–จ๏ธ Generating HTML report...") - html_path = generate_html_report(analysis_result, output) - click.echo(f"โœ… HTML report generated: {html_path}") - - # Open the HTML report in the default browser - if not no_open: - click.echo("๐ŸŒ Opening report in browser...") - if sys.platform == 'darwin': # macOS - subprocess.run(['open', html_path]) - elif sys.platform == 'win32': # Windows - os.startfile(html_path) - else: # Linux - subprocess.run(['xdg-open', html_path]) + # Generate HTML report + click.echo("๐Ÿ–จ๏ธ Generating 
HTML report...") + html_path = generate_html_report(analysis_result, output) + click.echo(f"โœ… HTML report generated: {html_path}") + + # Open the HTML report in the default browser + if not no_open: + click.echo("๐ŸŒ Opening report in browser...") + if sys.platform == 'darwin': # macOS + subprocess.run(['open', html_path]) + elif sys.platform == 'win32': # Windows + os.startfile(html_path) + else: # Linux + subprocess.run(['xdg-open', html_path]) except ValueError as e: click.echo(f"โŒ Error: {e}", err=True) diff --git a/diffgraph/graph_export.py b/diffgraph/graph_export.py new file mode 100644 index 0000000..9ded422 --- /dev/null +++ b/diffgraph/graph_export.py @@ -0,0 +1,273 @@ +import json +import pickle +from pathlib import Path +from typing import Dict, Any, Optional +import networkx as nx +from networkx.readwrite import json_graph +from .graph_manager import GraphManager, FileNode, ComponentNode, ChangeType, FileStatus + + +def export_graph_to_json(graph_manager: GraphManager, output_path: str) -> str: + """ + Export graph data to JSON format. + + Args: + graph_manager: GraphManager instance containing the graphs + output_path: Path where the JSON file should be saved + + Returns: + Path to the generated JSON file + """ + # Prepare the data structure + data = _prepare_graph_data(graph_manager) + + # Write to JSON file + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2, ensure_ascii=False) + + return str(Path(output_path).absolute()) + + +def export_graph_to_pickle(graph_manager: GraphManager, output_path: str) -> str: + """ + Export graph data to pickle format. 
+ + Args: + graph_manager: GraphManager instance containing the graphs + output_path: Path where the pickle file should be saved + + Returns: + Path to the generated pickle file + """ + # Prepare the data structure + data = _prepare_graph_data(graph_manager) + + # Write to pickle file + with open(output_path, 'wb') as f: + pickle.dump(data, f) + + return str(Path(output_path).absolute()) + + +def export_graph_to_graphml(graph_manager: GraphManager, output_path: str) -> str: + """ + Export graph data to GraphML format. + + Args: + graph_manager: GraphManager instance containing the graphs + output_path: Path where the GraphML file should be saved + + Returns: + Path to the generated GraphML file + """ + # GraphML doesn't support complex objects, so we'll create simplified versions + # with serialized attributes + + # Create a combined graph for GraphML export + combined_graph = nx.DiGraph() + + # Add file nodes + for file_path, node in graph_manager.file_nodes.items(): + combined_graph.add_node( + file_path, + node_type='file', + status=node.status.value, + change_type=node.change_type.value, + summary=node.summary or '', + error=node.error or '', + components=json.dumps([c.name if hasattr(c, 'name') else str(c) for c in (node.components or [])]) + ) + + # Add component nodes + for component_id, node in graph_manager.component_nodes.items(): + combined_graph.add_node( + component_id, + node_type='component', + name=node.name, + file_path=node.file_path, + change_type=node.change_type.value, + component_type=node.component_type, + parent=node.parent or '', + summary=node.summary or '', + dependencies=json.dumps(node.dependencies), + dependents=json.dumps(node.dependents) + ) + + # Add edges from both graphs + for source, target in graph_manager.file_graph.edges(): + combined_graph.add_edge(source, target, graph_type='file') + + for source, target in graph_manager.component_graph.edges(): + combined_graph.add_edge(source, target, graph_type='component') + + # Write to 
GraphML file + nx.write_graphml(combined_graph, output_path) + + return str(Path(output_path).absolute()) + + +def _prepare_graph_data(graph_manager: GraphManager) -> Dict[str, Any]: + """ + Prepare graph data for serialization. + + Args: + graph_manager: GraphManager instance containing the graphs + + Returns: + Dictionary containing all graph data and metadata + """ + # Convert file nodes to serializable format + file_nodes_data = {} + for file_path, node in graph_manager.file_nodes.items(): + file_nodes_data[file_path] = { + 'path': node.path, + 'status': node.status.value, + 'change_type': node.change_type.value, + 'summary': node.summary, + 'error': node.error, + 'components': [ + { + 'name': c.name if hasattr(c, 'name') else str(c), + 'change_type': c.change_type if hasattr(c, 'change_type') else 'unknown', + 'summary': c.summary if hasattr(c, 'summary') else None + } if hasattr(c, '__dict__') else str(c) + for c in (node.components or []) + ] + } + + # Convert component nodes to serializable format + component_nodes_data = {} + for component_id, node in graph_manager.component_nodes.items(): + component_nodes_data[component_id] = { + 'name': node.name, + 'file_path': node.file_path, + 'change_type': node.change_type.value, + 'component_type': node.component_type, + 'parent': node.parent, + 'summary': node.summary, + 'dependencies': node.dependencies, + 'dependents': node.dependents + } + + # Convert graphs to node-link format + file_graph_data = json_graph.node_link_data(graph_manager.file_graph, edges="links") + component_graph_data = json_graph.node_link_data(graph_manager.component_graph, edges="links") + + # Combine all data + data = { + 'version': '1.0', + 'file_nodes': file_nodes_data, + 'component_nodes': component_nodes_data, + 'file_graph': file_graph_data, + 'component_graph': component_graph_data, + 'processed_files': list(graph_manager.processed_files) + } + + return data + + +def load_graph_from_json(json_path: str) -> GraphManager: + """ + Load 
graph data from a JSON file. + + Args: + json_path: Path to the JSON file + + Returns: + GraphManager instance with loaded data + """ + with open(json_path, 'r', encoding='utf-8') as f: + data = json.load(f) + + return _reconstruct_graph_manager(data) + + +def load_graph_from_pickle(pickle_path: str) -> GraphManager: + """ + Load graph data from a pickle file. + + Args: + pickle_path: Path to the pickle file + + Returns: + GraphManager instance with loaded data + """ + with open(pickle_path, 'rb') as f: + data = pickle.load(f) + + return _reconstruct_graph_manager(data) + + +def _reconstruct_graph_manager(data: Dict[str, Any]) -> GraphManager: + """ + Reconstruct a GraphManager instance from serialized data. + + Args: + data: Dictionary containing serialized graph data + + Returns: + GraphManager instance with reconstructed data + """ + graph_manager = GraphManager() + + # Reconstruct file nodes + for file_path, node_data in data['file_nodes'].items(): + file_node = FileNode( + path=node_data['path'], + status=FileStatus(node_data['status']), + change_type=ChangeType(node_data['change_type']), + summary=node_data.get('summary'), + error=node_data.get('error'), + components=node_data.get('components', []) + ) + graph_manager.file_nodes[file_path] = file_node + + # Reconstruct component nodes + for component_id, node_data in data['component_nodes'].items(): + component_node = ComponentNode( + name=node_data['name'], + file_path=node_data['file_path'], + change_type=ChangeType(node_data['change_type']), + component_type=node_data['component_type'], + parent=node_data.get('parent'), + summary=node_data.get('summary'), + dependencies=node_data.get('dependencies', []), + dependents=node_data.get('dependents', []) + ) + graph_manager.component_nodes[component_id] = component_node + + # Reconstruct graphs + graph_manager.file_graph = json_graph.node_link_graph(data['file_graph'], directed=True, edges="links") + graph_manager.component_graph = 
json_graph.node_link_graph(data['component_graph'], directed=True, edges="links") + + # Reconstruct processed files set + graph_manager.processed_files = set(data.get('processed_files', [])) + + return graph_manager + + +def export_graph(graph_manager: GraphManager, output_path: str, format: str = 'json') -> str: + """ + Export graph data in the specified format. + + Args: + graph_manager: GraphManager instance containing the graphs + output_path: Path where the file should be saved + format: Export format ('json', 'pickle', or 'graphml') + + Returns: + Path to the generated file + + Raises: + ValueError: If format is not supported + """ + format = format.lower() + + if format == 'json': + return export_graph_to_json(graph_manager, output_path) + elif format == 'pickle': + return export_graph_to_pickle(graph_manager, output_path) + elif format == 'graphml': + return export_graph_to_graphml(graph_manager, output_path) + else: + raise ValueError(f"Unsupported format: {format}. Supported formats: json, pickle, graphml") diff --git a/diffgraph/graph_manager.py b/diffgraph/graph_manager.py index e257092..d07d464 100644 --- a/diffgraph/graph_manager.py +++ b/diffgraph/graph_manager.py @@ -300,4 +300,61 @@ def get_mermaid_diagram(self) -> str: mermaid.append(" classDef hidden fill:none,stroke:none") - return "\n".join(mermaid) \ No newline at end of file + return "\n".join(mermaid) + + def export_to_dict(self) -> dict: + """ + Export the graph manager state to a dictionary. + This is useful for serialization and can be passed to graph_export functions. 
+ + Returns: + Dictionary containing all graph data and metadata + """ + from networkx.readwrite import json_graph + + # Convert file nodes to serializable format + file_nodes_data = {} + for file_path, node in self.file_nodes.items(): + file_nodes_data[file_path] = { + 'path': node.path, + 'status': node.status.value, + 'change_type': node.change_type.value, + 'summary': node.summary, + 'error': node.error, + 'components': [ + { + 'name': c.name if hasattr(c, 'name') else str(c), + 'change_type': c.change_type if hasattr(c, 'change_type') else 'unknown', + 'summary': c.summary if hasattr(c, 'summary') else None + } if hasattr(c, '__dict__') else str(c) + for c in (node.components or []) + ] + } + + # Convert component nodes to serializable format + component_nodes_data = {} + for component_id, node in self.component_nodes.items(): + component_nodes_data[component_id] = { + 'name': node.name, + 'file_path': node.file_path, + 'change_type': node.change_type.value, + 'component_type': node.component_type, + 'parent': node.parent, + 'summary': node.summary, + 'dependencies': node.dependencies, + 'dependents': node.dependents + } + + # Convert graphs to node-link format + file_graph_data = json_graph.node_link_data(self.file_graph, edges="links") + component_graph_data = json_graph.node_link_data(self.component_graph, edges="links") + + # Combine all data + return { + 'version': '1.0', + 'file_nodes': file_nodes_data, + 'component_nodes': component_nodes_data, + 'file_graph': file_graph_data, + 'component_graph': component_graph_data, + 'processed_files': list(self.processed_files) + } \ No newline at end of file diff --git a/example_usage.py b/example_usage.py new file mode 100644 index 0000000..39dab4c --- /dev/null +++ b/example_usage.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +""" +Example script showing how to load and use exported graph data. + +This demonstrates how another program can read the exported graph data +and access all the analysis information. 
+""" + +from diffgraph.graph_export import load_graph_from_json +import sys + + +def analyze_exported_graph(json_path: str): + """ + Load and analyze an exported graph file. + + Args: + json_path: Path to the exported JSON file + """ + print(f"Loading graph data from: {json_path}") + print("-" * 60) + + # Load the graph + graph_manager = load_graph_from_json(json_path) + + # Summary statistics + print("\n๐Ÿ“Š Graph Statistics:") + print(f" Total files: {len(graph_manager.file_nodes)}") + print(f" Total components: {len(graph_manager.component_nodes)}") + print(f" File dependencies: {graph_manager.file_graph.number_of_edges()}") + print(f" Component dependencies: {graph_manager.component_graph.number_of_edges()}") + print(f" Processed files: {len(graph_manager.processed_files)}") + + # Analyze files + print("\n๐Ÿ“ File Analysis:") + for file_path, file_node in graph_manager.file_nodes.items(): + print(f"\n {file_path}") + print(f" Status: {file_node.status.value}") + print(f" Change Type: {file_node.change_type.value}") + if file_node.summary: + print(f" Summary: {file_node.summary[:80]}...") + if file_node.error: + print(f" Error: {file_node.error}") + + # Analyze components + print("\n๐Ÿ”ง Component Analysis:") + for component_id, component_node in graph_manager.component_nodes.items(): + print(f"\n {component_node.name} ({component_node.component_type})") + print(f" File: {component_node.file_path}") + print(f" Change Type: {component_node.change_type.value}") + if component_node.parent: + print(f" Parent: {component_node.parent}") + if component_node.dependencies: + print(f" Dependencies: {', '.join(component_node.dependencies[:3])}{'...' if len(component_node.dependencies) > 3 else ''}") + if component_node.dependents: + print(f" Dependents: {', '.join(component_node.dependents[:3])}{'...' 
if len(component_node.dependents) > 3 else ''}") + if component_node.summary: + print(f" Summary: {component_node.summary[:80]}...") + + # Find most connected components + if graph_manager.component_nodes: + print("\n๐ŸŒŸ Most Connected Components:") + component_connections = [] + for component_id, component_node in graph_manager.component_nodes.items(): + total_connections = len(component_node.dependencies) + len(component_node.dependents) + if total_connections > 0: + component_connections.append((component_node.name, total_connections)) + + component_connections.sort(key=lambda x: x[1], reverse=True) + for name, count in component_connections[:5]: + print(f" {name}: {count} connections") + + print("\n" + "=" * 60) + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python example_usage.py ") + print("\nExample:") + print(" # First, export graph data from your changes") + print(" wild diff --format graph --output my-changes.json") + print() + print(" # Then analyze the exported data") + print(" python example_usage.py my-changes.json") + sys.exit(1) + + json_path = sys.argv[1] + analyze_exported_graph(json_path) diff --git a/setup.py b/setup.py index 33e623f..5624d69 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="wild", - version="1.0.0", + version="1.1.0", packages=find_packages(), install_requires=[ "click>=8.1.7", diff --git a/test_cli_manual.sh b/test_cli_manual.sh new file mode 100755 index 0000000..1c90314 --- /dev/null +++ b/test_cli_manual.sh @@ -0,0 +1,145 @@ +#!/bin/bash +# Manual CLI testing script for the new graph export feature + +set -e # Exit on error + +echo "๐Ÿงช DiffGraph CLI Test Suite" +echo "================================" +echo "" + +# Check if we're in a git repo +if ! git rev-parse --git-dir > /dev/null 2>&1; then + echo "โŒ Not in a git repository" + exit 1 +fi + +echo "โœ… Git repository detected" +echo "" + +# Check if dependencies are installed +echo "๐Ÿ“ฆ Checking dependencies..." 
+python -c "import click, networkx" 2>/dev/null || { + echo "โš ๏ธ Installing dependencies..." + pip install -q -r requirements.txt +} +echo "โœ… Dependencies OK" +echo "" + +# Test 1: Create sample data without API +echo "Test 1: Testing graph export with sample data (no API needed)" +echo "--------------------------------------------------------------" +python << 'PYEOF' +from diffgraph.graph_manager import GraphManager, ChangeType +from diffgraph.graph_export import export_graph + +# Create test graph +gm = GraphManager() +gm.add_file('app/main.py', ChangeType.MODIFIED) +gm.add_file('app/utils.py', ChangeType.ADDED) +gm.add_component('Application', 'app/main.py', ChangeType.MODIFIED, 'container', summary='Main app class') +gm.add_component('run', 'app/main.py', ChangeType.MODIFIED, 'method', parent='Application', summary='Runs the app') +gm.add_component('helper', 'app/utils.py', ChangeType.ADDED, 'function', summary='Helper function') +gm.add_component_dependency('app/main.py::run', 'app/utils.py::helper') +gm.mark_processed('app/main.py', 'Modified main file', []) +gm.mark_processed('app/utils.py', 'Added utils', []) + +# Export in all formats +export_graph(gm, 'test_output.json', format='json') +export_graph(gm, 'test_output.pkl', format='pickle') +export_graph(gm, 'test_output.graphml', format='graphml') + +print('โœ… Generated test files:') +print(' - test_output.json') +print(' - test_output.pkl') +print(' - test_output.graphml') +PYEOF +echo "" + +# Test 2: Verify JSON structure +echo "Test 2: Verifying JSON structure" +echo "----------------------------------" +python << 'PYEOF' +import json + +with open('test_output.json', 'r') as f: + data = json.load(f) + +assert 'version' in data, "Missing version field" +assert 'file_nodes' in data, "Missing file_nodes" +assert 'component_nodes' in data, "Missing component_nodes" +assert len(data['file_nodes']) == 2, f"Expected 2 files, got {len(data['file_nodes'])}" +assert len(data['component_nodes']) == 3, 
f"Expected 3 components, got {len(data['component_nodes'])}" + +print('โœ… JSON structure is valid') +print(f' - Files: {len(data["file_nodes"])}') +print(f' - Components: {len(data["component_nodes"])}') +print(f' - Version: {data["version"]}') +PYEOF +echo "" + +# Test 3: Load and verify data integrity +echo "Test 3: Testing data load/export round-trip" +echo "--------------------------------------------" +python << 'PYEOF' +from diffgraph.graph_export import load_graph_from_json, load_graph_from_pickle + +# Load from JSON +gm_json = load_graph_from_json('test_output.json') +print(f'โœ… Loaded from JSON: {len(gm_json.file_nodes)} files, {len(gm_json.component_nodes)} components') + +# Load from pickle +gm_pickle = load_graph_from_pickle('test_output.pkl') +print(f'โœ… Loaded from pickle: {len(gm_pickle.file_nodes)} files, {len(gm_pickle.component_nodes)} components') + +# Verify they match +assert len(gm_json.file_nodes) == len(gm_pickle.file_nodes), "File count mismatch" +assert len(gm_json.component_nodes) == len(gm_pickle.component_nodes), "Component count mismatch" +print('โœ… Data integrity verified across formats') +PYEOF +echo "" + +# Test 4: Test example usage script +echo "Test 4: Testing example usage script" +echo "-------------------------------------" +python example_usage.py test_output.json | head -30 +echo "" +echo "โœ… Example script works correctly" +echo "" + +# Test 5: Test CLI help +echo "Test 5: Verifying CLI options" +echo "------------------------------" +python -m diffgraph.cli --help | grep -E "(--format|--graph-format)" && echo "โœ… New CLI options are present" || echo "โŒ CLI options missing" +echo "" + +# Test 6: Check file sizes +echo "Test 6: Checking output file sizes" +echo "-----------------------------------" +ls -lh test_output.* | awk '{print " " $9 ": " $5}' +echo "" + +# Cleanup +echo "๐Ÿงน Cleanup" +echo "----------" +read -p "Remove test files? 
(y/N) " -n 1 -r +echo +if [[ $REPLY =~ ^[Yy]$ ]]; then + rm -f test_output.json test_output.pkl test_output.graphml + echo "โœ… Test files removed" +else + echo "โ„น๏ธ Test files kept for inspection" +fi +echo "" + +echo "================================" +echo "โœ… All tests passed!" +echo "" +echo "Next steps:" +echo " 1. To test with real code changes, run:" +echo " python -m diffgraph.cli diff --format graph --output my-analysis.json" +echo "" +echo " 2. To analyze exported data:" +echo " python example_usage.py my-analysis.json" +echo "" +echo " 3. To generate HTML (default behavior):" +echo " python -m diffgraph.cli diff --output report.html" diff --git a/test_graph_export.py b/test_graph_export.py new file mode 100644 index 0000000..b1650a4 --- /dev/null +++ b/test_graph_export.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +""" +Simple test script to verify graph export functionality. +This creates a minimal graph and tests export/import in all formats. +""" + +from diffgraph.graph_manager import GraphManager, ChangeType, FileStatus +from diffgraph.graph_export import export_graph, load_graph_from_json, load_graph_from_pickle +import os +import json + + +def test_graph_export(): + """Test graph export functionality.""" + print("Creating test graph...") + + # Create a test graph manager + gm = GraphManager() + + # Add some test data + gm.add_file("test_file1.py", ChangeType.MODIFIED) + gm.add_file("test_file2.py", ChangeType.ADDED) + + gm.add_component("TestClass", "test_file1.py", ChangeType.MODIFIED, "container") + gm.add_component("test_function", "test_file1.py", ChangeType.ADDED, "function") + gm.add_component("test_method", "test_file1.py", ChangeType.MODIFIED, "method", + parent="TestClass", summary="A test method") + + gm.add_component("NewClass", "test_file2.py", ChangeType.ADDED, "container", + summary="A new class") + + # Add a dependency + gm.add_component_dependency("test_file1.py::test_function", "test_file1.py::TestClass") + + # Mark files as 
processed + gm.mark_processed("test_file1.py", "Modified test file", []) + gm.mark_processed("test_file2.py", "New test file", []) + + print(f" Files in graph: {len(gm.file_nodes)}") + print(f" Components in graph: {len(gm.component_nodes)}") + print(f" File edges: {gm.file_graph.number_of_edges()}") + print(f" Component edges: {gm.component_graph.number_of_edges()}") + + # Test JSON export + print("\nTesting JSON export...") + json_path = "test_output.json" + result_path = export_graph(gm, json_path, format='json') + print(f" Exported to: {result_path}") + + # Verify JSON file exists and is valid + assert os.path.exists(json_path), "JSON file was not created" + with open(json_path, 'r') as f: + data = json.load(f) + assert 'version' in data, "JSON missing version" + assert 'file_nodes' in data, "JSON missing file_nodes" + assert 'component_nodes' in data, "JSON missing component_nodes" + print(f" JSON contains {len(data['file_nodes'])} files and {len(data['component_nodes'])} components") + + # Test loading from JSON + print("\nTesting JSON import...") + loaded_gm = load_graph_from_json(json_path) + print(f" Loaded {len(loaded_gm.file_nodes)} files") + print(f" Loaded {len(loaded_gm.component_nodes)} components") + assert len(loaded_gm.file_nodes) == len(gm.file_nodes), "File count mismatch" + assert len(loaded_gm.component_nodes) == len(gm.component_nodes), "Component count mismatch" + + # Test pickle export + print("\nTesting pickle export...") + pickle_path = "test_output.pkl" + result_path = export_graph(gm, pickle_path, format='pickle') + print(f" Exported to: {result_path}") + assert os.path.exists(pickle_path), "Pickle file was not created" + + # Test loading from pickle + print("\nTesting pickle import...") + loaded_gm_pickle = load_graph_from_pickle(pickle_path) + print(f" Loaded {len(loaded_gm_pickle.file_nodes)} files") + print(f" Loaded {len(loaded_gm_pickle.component_nodes)} components") + assert len(loaded_gm_pickle.file_nodes) == len(gm.file_nodes), 
"File count mismatch" + assert len(loaded_gm_pickle.component_nodes) == len(gm.component_nodes), "Component count mismatch" + + # Test GraphML export + print("\nTesting GraphML export...") + graphml_path = "test_output.graphml" + result_path = export_graph(gm, graphml_path, format='graphml') + print(f" Exported to: {result_path}") + assert os.path.exists(graphml_path), "GraphML file was not created" + + # Verify GraphML is valid XML + with open(graphml_path, 'r') as f: + content = f.read() + assert ' Date: Fri, 24 Oct 2025 23:54:51 -0700 Subject: [PATCH 2/4] docs: Add structured output format design documentation Add comprehensive design documents for new integration-friendly JSON output format to docs/planning/ folder to keep root directory clean. Documents: - STRUCTURED_OUTPUT_DESIGN.md: Complete schema specification - File categorization (source/docs/config/auto-generated) - Rich component metadata (complexity, impact radius) - External dependency nodes (APIs, databases, services) - Comprehensive relationship types (REST, RPC, pub/sub) - Cross-references between docs and code - Phase-based implementation strategy - PHASE1_IMPLEMENTATION_NOTES.md: Implementation guide - Implementation decisions and rationale - Phase 1 scope (basic restructuring with existing data) - Phase 2/3 future enhancements - Testing strategy and success criteria - Next session pickup instructions These documents preserve design rationale and guide future implementation without cluttering the codebase root. 
--- docs/planning/PHASE1_IMPLEMENTATION_NOTES.md | 314 +++++++++ docs/planning/STRUCTURED_OUTPUT_DESIGN.md | 643 +++++++++++++++++++ 2 files changed, 957 insertions(+) create mode 100644 docs/planning/PHASE1_IMPLEMENTATION_NOTES.md create mode 100644 docs/planning/STRUCTURED_OUTPUT_DESIGN.md diff --git a/docs/planning/PHASE1_IMPLEMENTATION_NOTES.md b/docs/planning/PHASE1_IMPLEMENTATION_NOTES.md new file mode 100644 index 0000000..ea13225 --- /dev/null +++ b/docs/planning/PHASE1_IMPLEMENTATION_NOTES.md @@ -0,0 +1,314 @@ +# Phase 1 Implementation Notes + +**Goal**: Ship a working version with restructured output using only existing data. + +## Implementation Questions - Answered + +### 1. Impact Radius Calculation + +**Question**: Should this be calculated from: +- Just the diff graph? (only changed components) +- The full codebase graph? (would require analyzing unchanged files too) + +**Answer for Phase 1**: Calculate from the diff graph only. + +**Implementation**: +```python +# Simple calculation from existing data +impact_radius = len(component.dependencies) + len(component.dependents) +``` + +**For Phase 2**: Analyze full codebase to get true impact radius including transitive dependencies. + +### 2. Relationship Detection + +**Question**: For detecting REST/RPC/pub-sub patterns, should we: +- Use AI to identify them from code context? +- Use pattern matching (e.g., `requests.post()`, `grpc.call()`)? +- Hybrid approach? + +**Answer for Phase 1**: Keep it simple - use generic relationship types from existing graph. + +**Implementation**: +- Map existing edges to basic relationships: `imports`, `calls`, `extends`, `implements` +- Don't detect REST/RPC/pub-sub yet +- Use `calls` as the generic relationship for most function calls + +**For Phase 2**: Add pattern matching and AI detection for specialized relationships. + +### 3. Auto-Generated Patterns + +**Question**: Should I create a configurable patterns file? + +**Answer**: Yes, but keep it minimal for Phase 1. 
+ +**Implementation**: +```python +# Hardcode in the module for now +AUTO_GENERATED_PATTERNS = [ + '*-lock.json', '*.lock', + '*.min.js', '*.min.css', + 'dist/*', 'build/*', 'target/*', + '__pycache__/*', '*.pyc' +] + +DOC_PATTERNS = [ + '*.md', '*.rst', '*.adoc', + 'docs/*', 'documentation/*' +] + +CONFIG_PATTERNS = [ + '*.toml', '*.yaml', '*.yml', + 'setup.py', 'setup.cfg', 'pyproject.toml', + '.*rc', '.*ignore', 'Makefile', 'Dockerfile', + 'package.json', 'tsconfig.json' +] +``` + +**For Phase 2**: Move to external config file, add more patterns, use AI for edge cases. + +### 4. AI Prompts + +**Question**: Should I keep backward compatibility or completely replace? + +**Answer**: Completely replace for JSON format. Keep NetworkX format for pickle/graphml. + +**Implementation**: +- When `--graph-format json`: Use new structured format +- When `--graph-format pickle` or `graphml`: Use current NetworkX format +- No backward compatibility needed (hasn't shipped yet) + +## What to Implement in Phase 1 + +### Module Structure + +Create new module: `diffgraph/structured_export.py` + +```python +# diffgraph/structured_export.py + +def export_structured_json(graph_manager: GraphManager, + output_path: str, + diff_args: List[str]) -> str: + """ + Export graph in structured JSON format. + + Phase 1: Uses existing data, leaves some fields null. 
+ """ + pass + +def classify_file(file_path: str) -> str: + """Classify file as: auto_generated, documentation, configuration, or source_code""" + pass + +def get_file_stats(file_path: str, diff_args: List[str]) -> Dict: + """Get git diff stats (additions/deletions) for a file""" + pass + +def transform_to_structured_format(graph_manager: GraphManager, + diff_args: List[str]) -> dict: + """Transform NetworkX graph data to structured format""" + pass +``` + +### CLI Changes + +Update `diffgraph/cli.py`: + +```python +# When format is 'graph' and graph_format is 'json' +if format == 'graph': + if graph_format == 'json': + # Use new structured format + from diffgraph.structured_export import export_structured_json + graph_path = export_structured_json(agent.graph_manager, output, diff_args) + else: + # Use existing NetworkX format for pickle/graphml + graph_path = export_graph(agent.graph_manager, output, graph_format) +``` + +### Data Extraction + +#### From Existing GraphManager + +Available data: +- `graph_manager.file_nodes` โ†’ File information +- `graph_manager.component_nodes` โ†’ Component information +- `graph_manager.file_graph` โ†’ File dependencies +- `graph_manager.component_graph` โ†’ Component dependencies +- `graph_manager.processed_files` โ†’ Which files completed analysis + +Already have: +- Component names, types, summaries +- Parent relationships (for nested components) +- Change types (added, deleted, modified) +- Dependencies and dependents lists + +#### From Git + +Need to extract: +```bash +# File-level stats +git diff --numstat [diff_args] -- +# Output: additions\tdeletions\tfilename + +# Component-level line numbers (harder - skip for Phase 1) +# Leave as null for now +``` + +### Fields to Leave Null/Empty in Phase 1 + +**Component Nodes**: +- `complexity`: null (needs cyclomatic complexity analysis) +- `old_line_number`: null (needs git diff parsing) +- `new_line_number`: null (needs git diff parsing) +- `parameters`: null (needs signature 
parsing) +- `return_type`: null (needs signature parsing) + +**Per-component Stats**: +- `additions`: null (needs per-function diff analysis) +- `deletions`: null (needs per-function diff analysis) + +**Documentation**: +- `sections_modified`: empty array (needs doc parsing) +- `cross_references`: empty array (needs NLP) + +**Configuration**: +- `config_changes`: empty array (needs structured parsing) +- `cross_references`: empty array (needs analysis) + +**External Nodes**: +- Don't create for Phase 1 (or create with minimal info if edge targets them) + +**Advanced Relationships**: +- Only use: `imports`, `calls`, `extends`, `implements` +- No REST/RPC/pub-sub/database for Phase 1 + +### Simple Transformation Logic + +```python +def transform_component_to_structured(component_id: str, + component: ComponentNode, + graph_manager: GraphManager) -> dict: + # Simple impact radius from existing dependency lists + impact_radius = len(component.dependencies) + len(component.dependents) + + return { + 'id': component_id, + 'parent_id': component.parent, + 'component_type': component.component_type, + 'name': component.name, + 'file_path': component.file_path, + 'old_line_number': None, # Phase 2 + 'new_line_number': None, # Phase 2 + 'change_type': component.change_type.value, + 'additions': None, # Phase 2 + 'deletions': None, # Phase 2 + 'summary': component.summary, + 'complexity': None, # Phase 2 + 'impact_radius': impact_radius, + 'parameters': None, # Phase 2 + 'return_type': None # Phase 2 + } + +def transform_edge_to_structured(source: str, + target: str, + graph_manager: GraphManager) -> dict: + # Determine relationship type from context + # For Phase 1: just use 'calls' for most things + relationship = 'calls' # Default + + # Could check if both are in same file โ†’ might be 'extends' or 'implements' + # But keep simple for Phase 1 + + return { + 'source': source, + 'target': target, + 'relationship': relationship, + 'change_type': 'added', # Simplified for 
Phase 1 + 'summary': '' # Empty for Phase 1, or copy from node summaries + } +``` + +## Testing for Phase 1 + +### Test Cases + +1. **Basic transformation**: Simple diff with 2-3 files +2. **File classification**: Mix of .py, .md, .json, package-lock.json +3. **Component hierarchy**: Nested components (class โ†’ methods) +4. **Edge transformation**: Various relationship types +5. **Deletions**: Removed files and components +6. **Git stats**: Verify additions/deletions extracted correctly + +### Validation + +- JSON schema validation +- All edge targets exist in nodes +- No duplicate node IDs +- Consistent change_types +- Null fields are actually null (not missing) + +## Success Criteria for Phase 1 + +- โœ… Restructured JSON output generated +- โœ… File classification works (basic patterns) +- โœ… All existing data preserved +- โœ… Graph completeness (edges โ†’ nodes) +- โœ… Git stats extracted for files +- โœ… Tests pass +- โœ… Documentation updated + +**Not Required**: +- โŒ Complete metadata (complexity, line numbers, etc.) +- โŒ External dependency nodes +- โŒ Advanced relationship types +- โŒ Cross-references +- โŒ Full codebase impact analysis + +## Timeline + +**Phase 1**: ~2-3 hours implementation + testing +- Create structured_export.py +- Update CLI +- Add tests +- Update docs + +**Phase 2**: Future (1-2 weeks) +- Full analysis implementation +- External dependency detection +- Advanced relationships +- Complete metadata + +## Next Conversation Pickup + +**To resume Phase 1 implementation**: +1. Read `STRUCTURED_OUTPUT_DESIGN.md` - Full schema +2. Read this file - Implementation decisions +3. Start with: "Let's implement Phase 1 structured export" +4. Create `diffgraph/structured_export.py` +5. 
Update `diffgraph/cli.py` to use it for JSON format + +**Key files to modify**: +- New: `diffgraph/structured_export.py` +- Update: `diffgraph/cli.py` +- Update: `diffgraph/graph_export.py` (minor) +- New: `test_structured_export.py` +- Update: `README.md`, `CHANGELOG.md` + +**Commit message template**: +``` +feat: Add structured JSON output format (Phase 1) + +Implement integration-friendly structured output format with file +classification and rich metadata. Phase 1 uses existing analysis data. + +- Add diffgraph/structured_export.py for transformation +- Update CLI to use structured format for JSON export +- Add file classification (source/docs/config/auto-generated) +- Extract git diff stats for additions/deletions +- Leave advanced fields (complexity, line numbers) for Phase 2 + +Breaking: JSON format now outputs structured format instead of NetworkX +Legacy: Use --graph-format pickle for NetworkX format +``` diff --git a/docs/planning/STRUCTURED_OUTPUT_DESIGN.md b/docs/planning/STRUCTURED_OUTPUT_DESIGN.md new file mode 100644 index 0000000..6ab5941 --- /dev/null +++ b/docs/planning/STRUCTURED_OUTPUT_DESIGN.md @@ -0,0 +1,643 @@ +# Structured Output Design Document + +**Version**: 2.0 +**Status**: Design Phase โ†’ Initial Implementation +**Created**: 2025-10-24 +**Last Updated**: 2025-10-24 + +## Overview + +This document outlines the design for a new structured JSON output format optimized for integration consumption (VSCode extensions, web UIs, CI/CD tools). Unlike the current NetworkX graph serialization, this format is specifically designed to be: + +1. **Integration-friendly**: Easy to consume without understanding NetworkX internals +2. **Semantically rich**: Clear categorization of file types and relationships +3. **Complete**: All referenced entities exist as nodes in the graph +4. **Self-documenting**: Includes metadata and context for all elements + +## Design Philosophy + +### Problems with Current Format + +1. 
**Internal representation exposed**: NetworkX node-link format is optimized for our processing, not consumption +2. **Processing artifacts included**: Fields like `processed_files` are internal state +3. **Requires transformation**: Consumers need to understand NetworkX and transform data +4. **Missing context**: No distinction between source code, docs, config, auto-generated files + +### New Approach + +- **Categorize files** by purpose (source code, docs, config, auto-generated) +- **Explicit graph structure** with nodes and edges arrays +- **Rich metadata** including line numbers, additions/deletions, complexity +- **Complete graph** with external dependencies as first-class nodes +- **Clear semantics** with typed relationships and change types + +## Full Schema Specification + +### Top-Level Structure + +```json +{ + "version": "2.0", + "metadata": { ... }, + "auto_generated": [ ... ], + "documentation": { ... }, + "configuration": { ... }, + "source_code": { + "files": { "nodes": [...], "edges": [...] }, + "components": { "nodes": [...], "edges": [...] } + } +} +``` + +### Metadata Section + +```json +{ + "version": "2.0", + "metadata": { + "analyzed_at": "2025-10-24T22:54:00Z", + "diff_base": "main", // git ref for base + "diff_target": "HEAD", // git ref for target + "total_files_changed": 12, + "total_additions": 1296, + "total_deletions": 28, + "analyzer_version": "1.1.0", // CLI version + "ai_model": "gpt-4o" // AI model used for analysis + } +} +``` + +### Auto-Generated Files + +Files that should not be reviewed (lock files, build artifacts, etc.) 
+ +```json +{ + "auto_generated": [ + { + "path": "package-lock.json", + "classification_method": "pattern", // "pattern" or "ai" + "reason": "npm lock file", + "additions": 100, + "deletions": 50 + } + ] +} +``` + +**Classification Strategy**: +- **Pattern-based** (high confidence): Lock files, minified files, common build artifacts +- **AI-based** (edge cases): Framework-specific generated files, unusual patterns + +**Common Patterns** (to be hardcoded): +``` +*-lock.json, *.lock, yarn.lock, Gemfile.lock, Cargo.lock +*.min.js, *.min.css, *.bundle.js +dist/*, build/*, target/*, out/*, .next/*, __pycache__/* +*.pyc, *.class, *.o, *.so +package-lock.json, composer.lock, poetry.lock +.DS_Store, Thumbs.db +``` + +### Documentation Files + +Documentation that may reference code but isn't part of the dependency graph. + +```json +{ + "documentation": { + "README.md": { + "additions": 61, + "deletions": 5, + "summary": "Added graph export feature documentation with usage examples", + "sections_modified": ["Usage", "Output Formats", "Features"], + "cross_references": [ + { + "component_id": "diffgraph/cli.py::main", + "line_numbers": [47, 65, 71], + "context": "CLI usage examples" + } + ] + } + } +} +``` + +**Detection**: Pattern-based (*.md, docs/*, *.rst, *.adoc, etc.) + AI for edge cases + +**Cross-references**: AI can detect when docs mention specific components + +### Configuration Files + +Configuration files that affect system behavior but aren't source code. 
+ +```json +{ + "configuration": { + "setup.py": { + "additions": 1, + "deletions": 1, + "summary": "Version bump from 1.0.0 to 1.1.0", + "config_changes": [ + { + "key": "version", + "old_value": "1.0.0", + "new_value": "1.1.0", + "line_number": 5 + } + ], + "cross_references": [] + } + } +} +``` + +**Detection**: Pattern-based (*.toml, *.yaml, *.json config files, .*rc files) + AI + +**Structured Changes**: AI can extract specific config key changes for important files + +### Source Code - Files Graph + +File-level dependency graph (import relationships). + +```json +{ + "source_code": { + "files": { + "nodes": [ + { + "path": "diffgraph/graph_export.py", + "name": "graph_export.py", + "type": "src", // "src" or "test" + "change_type": "added", // "added", "deleted", "modified", "unchanged" + "additions": 273, + "deletions": 0, + "summary": "New module for graph data export in multiple formats", + "language": "python", + "old_path": null // for renamed files + } + ], + "edges": [ + { + "source": "diffgraph/cli.py", + "target": "diffgraph/graph_export.py", + "relationship": "imports", + "change_type": "added", // "added", "deleted", "modified", "unchanged" + "summary": "CLI now imports graph_export module for data export" + } + ] + } + } +} +``` + +### Source Code - Components Graph + +Component-level dependency graph (functions, classes, methods). 
+ +#### Component Nodes + +```json +{ + "components": { + "nodes": [ + // Regular component + { + "id": "diffgraph/graph_export.py::export_graph", + "parent_id": null, + "component_type": "function", + "name": "export_graph", + "file_path": "diffgraph/graph_export.py", + "old_line_number": null, + "new_line_number": 254, + "change_type": "added", + "additions": 15, + "deletions": 0, + "summary": "Main export function supporting JSON, pickle, and GraphML formats", + "complexity": "medium", // "low", "medium", "high" + "impact_radius": 3, // number of connected components + "parameters": ["graph_manager", "output_path", "format"], + "return_type": "str" + }, + + // Nested component (method inside class) + { + "id": "diffgraph/graph_manager.py::GraphManager::export_to_dict", + "parent_id": "diffgraph/graph_manager.py::GraphManager", + "component_type": "method", + "name": "export_to_dict", + "file_path": "diffgraph/graph_manager.py", + "old_line_number": null, + "new_line_number": 305, + "change_type": "added", + "additions": 59, + "deletions": 0, + "summary": "Exports graph manager state to dictionary for serialization", + "complexity": "medium", + "impact_radius": 2, + "parameters": [], + "return_type": "dict" + }, + + // Deleted component + { + "id": "diffgraph/old_module.py::OldClass", + "parent_id": null, + "component_type": "class", + "name": "OldClass", + "file_path": "diffgraph/old_module.py", + "old_line_number": 42, + "new_line_number": null, + "change_type": "deleted", + "additions": 0, + "deletions": 50, + "summary": "Removed deprecated class that was replaced by NewClass", + "complexity": "low", + "impact_radius": 1, + "parameters": null, + "return_type": null + }, + + // External service node + { + "id": "external::openai_api", + "parent_id": null, + "component_type": "external_service", + "name": "OpenAI API", + "file_path": null, + "old_line_number": null, + "new_line_number": null, + "change_type": "unchanged", + "additions": 0, + "deletions": 0, + 
"summary": "External OpenAI API service for code analysis", + "complexity": null, + "impact_radius": 5, // how many internal components use it + "parameters": null, + "return_type": null + }, + + // External API endpoint (nested under service) + { + "id": "external::openai_api::chat_completions", + "parent_id": "external::openai_api", + "component_type": "external_endpoint", + "name": "chat.completions", + "file_path": null, + "old_line_number": null, + "new_line_number": null, + "change_type": "unchanged", + "additions": 0, + "deletions": 0, + "summary": "OpenAI chat completions endpoint", + "complexity": null, + "impact_radius": 3, + "parameters": null, + "return_type": null + }, + + // Database node + { + "id": "external::postgres_db", + "parent_id": null, + "component_type": "external_database", + "name": "PostgreSQL Database", + "file_path": null, + "old_line_number": null, + "new_line_number": null, + "change_type": "unchanged", + "additions": 0, + "deletions": 0, + "summary": "Main application database", + "complexity": null, + "impact_radius": 10, + "parameters": null, + "return_type": null + }, + + // Database table (nested under database) + { + "id": "external::postgres_db::jobs_table", + "parent_id": "external::postgres_db", + "component_type": "database_table", + "name": "jobs_table", + "file_path": null, + "old_line_number": null, + "new_line_number": null, + "change_type": "modified", // schema change detected + "additions": 2, // columns added + "deletions": 0, + "summary": "Job queue table with status tracking", + "complexity": null, + "impact_radius": 5, + "parameters": null, + "return_type": null + } + ] + } +} +``` + +**Component Types**: +- **Source Code**: `class`, `function`, `method`, `interface`, `trait`, `module`, `enum`, `struct` +- **Test Code**: `test_class`, `test_function`, `test_method` +- **External**: `external_service`, `external_endpoint`, `external_database`, `database_table`, `database_collection`, `message_queue`, `cache_store` 
+ +#### Component Edges + +```json +{ + "edges": [ + // Function call + { + "source": "diffgraph/cli.py::main", + "target": "diffgraph/graph_export.py::export_graph", + "relationship": "calls", + "change_type": "added", + "summary": "CLI conditionally calls export_graph when --format graph is specified" + }, + + // REST API call + { + "source": "api/handlers.py::process_request", + "target": "external::openai_api::chat_completions", + "relationship": "rest_api_call", + "change_type": "modified", + "summary": "Updated to use streaming API responses" + }, + + // gRPC call + { + "source": "client/grpc_client.py::call_service", + "target": "external::grpc_service::ProcessBatch", + "relationship": "rpc_call", + "change_type": "added", + "summary": "New gRPC call for batch processing" + }, + + // Database access + { + "source": "worker/processor.py::process_job", + "target": "external::postgres_db::jobs_table", + "relationship": "shared_storage", + "change_type": "modified", + "summary": "Modified job status update logic" + }, + + // Pub/Sub + { + "source": "publisher/events.py::emit_event", + "target": "subscriber/listener.py::handle_event", + "relationship": "pub_sub", + "change_type": "added", + "summary": "New event subscription for job completion notifications" + }, + + // Import + { + "source": "module_a.py::ComponentA", + "target": "module_b.py::ComponentB", + "relationship": "imports", + "change_type": "deleted", + "summary": "Removed unused import after refactoring" + }, + + // Class inheritance + { + "source": "models/user.py::AdminUser", + "target": "models/user.py::BaseUser", + "relationship": "extends", + "change_type": "unchanged", + "summary": "AdminUser inherits from BaseUser" + }, + + // Interface implementation + { + "source": "handlers/file_handler.py::S3FileHandler", + "target": "interfaces/storage.py::StorageInterface", + "relationship": "implements", + "change_type": "unchanged", + "summary": "S3FileHandler implements StorageInterface" + } + ] +} 
+``` + +**Relationship Types**: +- **Code**: `imports`, `calls`, `extends`, `implements`, `uses` +- **External**: `rest_api_call`, `rpc_call`, `graphql_query`, `websocket_connection` +- **Data**: `shared_storage`, `database_query`, `cache_access`, `file_io` +- **Events**: `pub_sub`, `event_emit`, `event_listen`, `webhook` + +**Change Types**: `added`, `deleted`, `modified`, `unchanged` + +## Implementation Strategy + +### Phase 1: Basic Restructuring (Current Implementation) + +**Goal**: Ship a working version that restructures existing data without new analysis. + +**What to Implement**: +1. โœ… File classification (basic patterns only, AI for edge cases) +2. โœ… Restructure existing graph data into new schema +3. โœ… Extract additions/deletions from git diff stats +4. โœ… Use existing summaries from AI analysis +5. โœ… Copy component metadata that already exists + +**What to Leave Blank** (for Phase 2): +- `complexity`: Set to `null` +- `impact_radius`: Set to `0` or calculate from diff graph only +- `parameters`, `return_type`: Set to `null` +- `cross_references` in docs/config: Empty arrays +- `config_changes` details: Basic summary only +- External nodes: Don't add yet (or add with minimal info if referenced) +- Advanced relationships (REST/RPC/pub-sub): Use generic `calls` for now + +**Data Sources for Phase 1**: +- NetworkX graphs: `graph_manager.file_graph`, `graph_manager.component_graph` +- Existing nodes: `graph_manager.file_nodes`, `graph_manager.component_nodes` +- Git diff: Use `git diff --numstat` for additions/deletions +- Existing AI summaries: Already in component/file nodes + +### Phase 2: Enhanced Analysis (Future) + +**New Analysis Required**: +1. **Complexity Calculation**: + - Cyclomatic complexity for functions/methods + - Class complexity (weighted by methods) + - AI-based complexity assessment + +2. 
**Impact Radius**: + - Analyze full codebase (not just diff) + - Build complete dependency graph + - Calculate transitive dependencies + - Count upstream + downstream connections + +3. **External Dependencies**: + - Detect external service calls (API, RPC, GraphQL) + - Identify database/cache/queue access + - Create external nodes in graph + - Infer parent relationships (endpoint โ†’ service, table โ†’ database) + +4. **Relationship Detection**: + - Pattern matching: `requests.post()` โ†’ REST, `grpc.call()` โ†’ RPC + - AI analysis: Identify pub/sub, event patterns + - Database query detection: ORM patterns, raw SQL + +5. **Cross-References**: + - NLP on docs to find component mentions + - Extract line numbers where components are referenced + - Link config changes to affected components + +6. **Parameter & Return Types**: + - Parse function signatures + - Use type hints when available + - AI inference for dynamic languages + +### Phase 3: Advanced Features (Future) + +- **Security Analysis**: Identify vulnerable patterns +- **Performance Impact**: Flag performance-critical changes +- **Breaking Changes**: Detect API/signature changes +- **Test Coverage**: Map tests to source components +- **Migration Paths**: Suggest refactoring strategies + +## Technical Implementation Notes + +### File Classification + +```python +# Minimal patterns for Phase 1 +AUTO_GENERATED_PATTERNS = [ + '*-lock.json', '*.lock', '*.min.js', '*.min.css', + 'dist/*', 'build/*', '__pycache__/*', '*.pyc' +] + +DOC_PATTERNS = [ + '*.md', '*.rst', '*.adoc', 'docs/*', 'documentation/*' +] + +CONFIG_PATTERNS = [ + '*.toml', '*.yaml', '*.yml', 'setup.py', 'setup.cfg', + '.*rc', '.*ignore', 'Makefile', 'Dockerfile' +] + +def classify_file(path: str) -> str: + """Returns: 'auto_generated', 'documentation', 'configuration', or 'source_code'""" + # Check patterns first + if matches_patterns(path, AUTO_GENERATED_PATTERNS): + return 'auto_generated' + if matches_patterns(path, DOC_PATTERNS): + return 
'documentation' + if matches_patterns(path, CONFIG_PATTERNS): + return 'configuration' + + # Default to source code + return 'source_code' +``` + +### Extracting Git Stats + +```python +def get_diff_stats(file_path: str, diff_args: List[str]) -> Dict[str, int]: + """Get additions/deletions for a file using git diff --numstat""" + cmd = ['git', 'diff', '--numstat'] + diff_args + ['--', file_path] + result = subprocess.run(cmd, capture_output=True, text=True) + + # Output format: "additions\tdeletions\tfilename" + if result.stdout: + parts = result.stdout.strip().split('\t') + return { + 'additions': int(parts[0]) if parts[0] != '-' else 0, + 'deletions': int(parts[1]) if parts[1] != '-' else 0 + } + return {'additions': 0, 'deletions': 0} +``` + +### Mapping Current Data to New Schema + +```python +def transform_component_node(component_id: str, component: ComponentNode) -> dict: + """Transform existing ComponentNode to new schema""" + return { + 'id': component_id, + 'parent_id': component.parent, + 'component_type': component.component_type, # already have this + 'name': component.name, + 'file_path': component.file_path, + 'old_line_number': None, # TODO: extract from git diff + 'new_line_number': None, # TODO: extract from git diff + 'change_type': component.change_type.value, + 'additions': None, # TODO: calculate per-component + 'deletions': None, # TODO: calculate per-component + 'summary': component.summary, + 'complexity': None, # Phase 2 + 'impact_radius': len(component.dependencies) + len(component.dependents), # Simple calculation + 'parameters': None, # Phase 2 + 'return_type': None # Phase 2 + } +``` + +## Testing Strategy + +### Unit Tests + +- Test file classification for all patterns +- Test transformation of each node/edge type +- Test git stat extraction +- Test handling of missing data (nulls) + +### Integration Tests + +- Transform a real diff and validate schema +- Compare with current format (should have same info) +- Test with various 
programming languages +- Test with renames, moves, deletions + +### Validation + +- JSON schema validation +- Graph completeness (all edge targets exist as nodes) +- No orphaned nodes +- Consistent change_types + +## Future Considerations + +### Scalability + +- Large diffs (100+ files): Streaming output? +- Deep dependency graphs: Limit impact_radius calculation? +- External services: Cache discovered endpoints? + +### Extensibility + +- Plugin system for custom relationship types +- Custom component types per language/framework +- User-defined classification patterns + +### Integration Examples + +- VSCode extension consuming this format +- GitHub PR bot showing visual diff +- CI/CD pipeline blocking risky changes +- Documentation auto-update from cross-references + +## Migration Path + +1. **v1.1.0** (Current): NetworkX format +2. **v1.2.0**: Add structured format, make it default for JSON +3. **v1.3.0**: Enhanced analysis (Phase 2) +4. **v2.0.0**: Deprecate NetworkX format, structured only + +## References + +- Current implementation: `diffgraph/graph_export.py` +- Graph management: `diffgraph/graph_manager.py` +- AI analysis: `diffgraph/ai_analysis.py` +- Related: [GRAPH_EXPORT_FEATURE.md](./GRAPH_EXPORT_FEATURE.md) + +--- + +**Status**: Ready for Phase 1 implementation +**Next Steps**: Implement basic restructuring without new analysis +**Review Date**: After Phase 1 ships From 5684051de9d01da42bf026a3e081d5d790490bb5 Mon Sep 17 00:00:00 2001 From: Avikalp Kumar Gupta Date: Sat, 25 Oct 2025 00:05:07 -0700 Subject: [PATCH 3/4] feat: Implement structured JSON output format - Phase 1 (v1.2.0) Add integration-friendly structured JSON format optimized for VSCode extensions, web UIs, and other tool integrations. This is Phase 1 using existing analysis data without requiring new AI analysis. 
New Features: - Structured JSON output with clean categorization and rich metadata - Automatic file classification into 4 categories: * auto_generated: Lock files, build artifacts (excluded from review) * documentation: Docs with potential cross-references to code * configuration: Config files with change tracking * source_code: Source files with full dependency graphs - Git diff stats (additions/deletions) extracted per file - Impact radius calculation from dependency graph - Complete graph structure (all edge targets exist as nodes) - Pattern-based classification with 40+ common file patterns New Files: - diffgraph/structured_export.py: Core transformation logic (400+ lines) * File categorization and classification * Component and edge transformation * Git stats extraction * Structured format generation - test_structured_export.py: Comprehensive test suite * File classification tests * Component transformation tests * Full export validation * Graph completeness tests * Empty graph handling Modified Files: - diffgraph/cli.py: Route JSON format to structured export - README.md: Document structured format with examples - CHANGELOG.md: Added v1.2.0 release notes - setup.py: Bumped version to 1.2.0 - diffgraph/__init__.py: Updated version to 1.2.0 Breaking Changes: - JSON format now outputs structured format (v2.0 schema) - NetworkX format still available via pickle/graphml Technical Details: - Phase 1: Uses existing data, leaves advanced fields as null - Relationship types: imports, calls, extends, implements - Change types: added, deleted, modified, unchanged - Clean separation of files graph and components graph - All tests pass (5/5 test suites) Phase 2 Planned: - Full codebase impact analysis - External dependency nodes (APIs, databases) - Advanced relationships (REST, RPC, pub/sub) - Complexity metrics and line numbers - Cross-reference detection See docs/planning/STRUCTURED_OUTPUT_DESIGN.md for complete specification. 
--- CHANGELOG.md | 29 +++ README.md | 101 ++++++-- diffgraph/__init__.py | 2 +- diffgraph/cli.py | 8 +- diffgraph/structured_export.py | 433 +++++++++++++++++++++++++++++++++ setup.py | 2 +- test_structured_export.py | 304 +++++++++++++++++++++++ 7 files changed, 852 insertions(+), 27 deletions(-) create mode 100644 diffgraph/structured_export.py create mode 100644 test_structured_export.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 368db30..ce85ca4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,35 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.2.0] - 2025-10-24 + +### Added +- **Structured JSON Output Format (Phase 1)**: Integration-friendly JSON format optimized for VSCode extensions and UIs + - New `diffgraph/structured_export.py` module for structured data transformation + - Automatic file categorization: `auto_generated`, `documentation`, `configuration`, `source_code` + - Rich metadata including git diff stats (additions/deletions per file) + - Impact radius calculation from dependency graphs + - Clean separation of files and components with explicit graph structure + - Complete graph validation (all edge targets exist as nodes) + - Pattern-based classification with 40+ common patterns +- Comprehensive test suite (`test_structured_export.py`) for structured export +- Design documentation in `docs/planning/` for future enhancements + +### Changed +- **JSON format now uses structured output by default** (breaking change for JSON, but backwards compatible overall) + - `--format graph --graph-format json`: Now outputs structured format + - `--format graph --graph-format pickle`: Still uses NetworkX format + - `--format graph --graph-format graphml`: Still uses NetworkX format +- Updated README with structured JSON examples and usage patterns +- Enhanced 
documentation to explain categorization and structure + +### Technical Details +- Phase 1 implementation uses existing analysis data +- Advanced fields (complexity, line numbers, parameters) reserved for Phase 2 +- External dependency nodes reserved for Phase 2 +- Advanced relationship detection (REST/RPC/pub-sub) reserved for Phase 2 +- Structure designed for iterative enhancement without breaking changes + ## [1.1.0] - 2025-10-24 ### Added diff --git a/README.md b/README.md index 60dc238..40ab0e9 100644 --- a/README.md +++ b/README.md @@ -89,46 +89,99 @@ The generated HTML report includes: - Responsive design for all screen sizes ### Graph Data Export -When using `--format graph`, the tool exports the complete networkx graph data structure, allowing other programs to programmatically analyze the code changes: +When using `--format graph`, the tool exports graph data, allowing other programs to programmatically analyze the code changes: **Supported formats:** -- **JSON** (default): Human-readable, widely compatible format -- **Pickle**: Python-specific format that preserves exact data structures +- **JSON** (default): Structured, integration-friendly format optimized for VSCode extensions and UIs +- **Pickle**: Python-specific NetworkX format that preserves exact data structures - **GraphML**: Standard graph format compatible with many graph analysis tools +#### Structured JSON Format (Default) + +The JSON export provides a clean, categorized structure ideal for integrations: + +**File Categorization:** +- **auto_generated**: Lock files, build artifacts (excluded from review) +- **documentation**: Markdown, docs with cross-references to code +- **configuration**: Config files with structured change tracking +- **source_code**: Source files with full dependency graphs + **Exported data includes:** -- File-level dependency graph with metadata (status, change type, summary) +- File-level dependency graph with additions/deletions - Component-level dependency graph 
(functions, classes, methods) -- Complete analysis results for each file and component -- Relationships between components (dependencies and dependents) +- Change types for all nodes and edges +- Impact radius (number of dependent components) +- Git diff statistics per file +- Comprehensive metadata + +**Example JSON structure:** +```json +{ + "version": "2.0", + "metadata": { + "analyzed_at": "2025-10-24T23:00:00Z", + "total_files_changed": 12, + "total_additions": 1296, + "total_deletions": 28 + }, + "auto_generated": [...], + "documentation": {...}, + "configuration": {...}, + "source_code": { + "files": { + "nodes": [{"path": "...", "additions": 10, ...}], + "edges": [{"source": "...", "target": "...", "relationship": "imports"}] + }, + "components": { + "nodes": [{"id": "...", "name": "...", "impact_radius": 5, ...}], + "edges": [{"source": "...", "target": "...", "relationship": "calls"}] + } + } +} +``` -**Example: Loading and using exported graph data** +**Using structured JSON data:** ```python -from diffgraph.graph_export import load_graph_from_json -import networkx as nx +import json + +# Load the structured JSON +with open('diffgraph.json', 'r') as f: + data = json.load(f) + +# Access categorized files +print(f"Source files: {len(data['source_code']['files']['nodes'])}") +print(f"Documentation: {len(data['documentation'])}") +print(f"Auto-generated: {len(data['auto_generated'])}") + +# Access components +for component in data['source_code']['components']['nodes']: + print(f"{component['name']} ({component['component_type']})") + print(f" Impact radius: {component['impact_radius']}") + print(f" Change type: {component['change_type']}") + +# Access dependencies +for edge in data['source_code']['components']['edges']: + print(f"{edge['source']} -> {edge['target']} ({edge['relationship']})") +``` -# Load the exported graph data -graph_manager = load_graph_from_json('diffgraph.json') +#### NetworkX Format (Pickle/GraphML) -# Access file nodes -for file_path, 
file_node in graph_manager.file_nodes.items(): - print(f"File: {file_path}") - print(f" Status: {file_node.status.value}") - print(f" Change Type: {file_node.change_type.value}") - print(f" Summary: {file_node.summary}") +For advanced analysis or Python-specific use cases: -# Access component nodes -for component_id, component_node in graph_manager.component_nodes.items(): - print(f"Component: {component_node.name}") - print(f" Type: {component_node.component_type}") - print(f" Dependencies: {component_node.dependencies}") +```python +from diffgraph.graph_export import load_graph_from_pickle +import networkx as nx -# Use networkx to analyze the graphs -print(f"Total files: {graph_manager.file_graph.number_of_nodes()}") +# Load NetworkX format +graph_manager = load_graph_from_pickle('diffgraph.pkl') + +# Use NetworkX algorithms print(f"Total components: {graph_manager.component_graph.number_of_nodes()}") print(f"Component dependencies: {graph_manager.component_graph.number_of_edges()}") ``` +**See also**: [Structured Output Design](docs/planning/STRUCTURED_OUTPUT_DESIGN.md) for complete schema specification + ## ๐Ÿค Contributing Contributions are welcome! Please feel free to submit a Pull Request. 
diff --git a/diffgraph/__init__.py b/diffgraph/__init__.py index eb877fc..f07ec34 100644 --- a/diffgraph/__init__.py +++ b/diffgraph/__init__.py @@ -2,4 +2,4 @@ DiffGraph - A CLI tool for visualizing code changes with AI """ -__version__ = "1.1.0" \ No newline at end of file +__version__ = "1.2.0" \ No newline at end of file diff --git a/diffgraph/cli.py b/diffgraph/cli.py index 18bc00e..2c2f99e 100644 --- a/diffgraph/cli.py +++ b/diffgraph/cli.py @@ -8,6 +8,7 @@ from diffgraph.ai_analysis import CodeAnalysisAgent from diffgraph.html_report import generate_html_report, AnalysisResult from diffgraph.graph_export import export_graph +from diffgraph.structured_export import export_structured_json from diffgraph.env_loader import load_env_file, debug_environment from diffgraph.utils import sanitize_diff_args, involves_working_tree @@ -210,7 +211,12 @@ def progress_callback(current_file, total_files, status): if format == 'graph': # Export graph data click.echo(f"๐Ÿ’พ Exporting graph data in {graph_format} format...") - graph_path = export_graph(agent.graph_manager, output, graph_format) + if graph_format == 'json': + # Use structured format for JSON + graph_path = export_structured_json(agent.graph_manager, output, diff_args) + else: + # Use NetworkX format for pickle/graphml + graph_path = export_graph(agent.graph_manager, output, graph_format) click.echo(f"โœ… Graph data exported: {graph_path}") else: # Create analysis result diff --git a/diffgraph/structured_export.py b/diffgraph/structured_export.py new file mode 100644 index 0000000..49034ec --- /dev/null +++ b/diffgraph/structured_export.py @@ -0,0 +1,433 @@ +""" +Structured export module for generating integration-friendly JSON output. + +This module transforms the internal NetworkX graph representation into a +structured format optimized for consumption by VSCode extensions, web UIs, +and other integrations. + +Phase 1: Uses existing analysis data, leaves advanced fields as null. 
+Phase 2+: Will add full codebase analysis, external dependencies, etc. +""" + +import json +import subprocess +from pathlib import Path +from typing import Dict, List, Any, Optional, Tuple +from datetime import datetime, timezone +from .graph_manager import GraphManager, FileNode, ComponentNode, ChangeType + + +# File classification patterns +AUTO_GENERATED_PATTERNS = [ + '*-lock.json', '*.lock', 'yarn.lock', 'Gemfile.lock', 'Cargo.lock', + '*.min.js', '*.min.css', '*.bundle.js', + 'dist/*', 'build/*', 'target/*', 'out/*', '.next/*', + '__pycache__/*', '*.pyc', '*.class', '*.o', '*.so', + 'package-lock.json', 'composer.lock', 'poetry.lock', + '.DS_Store', 'Thumbs.db' +] + +DOC_PATTERNS = [ + '*.md', '*.rst', '*.adoc', + 'docs/*', 'documentation/*', 'doc/*', + 'CHANGELOG.*', 'HISTORY.*', 'AUTHORS.*', 'CONTRIBUTORS.*' +] + +CONFIG_PATTERNS = [ + '*.toml', '*.yaml', '*.yml', 'setup.py', 'setup.cfg', 'pyproject.toml', + '.*rc', '.*ignore', 'Makefile', 'Dockerfile', 'docker-compose.yml', + 'package.json', 'tsconfig.json', 'webpack.config.js', + 'requirements.txt', 'Pipfile', 'Gemfile' +] + + +def matches_pattern(file_path: str, patterns: List[str]) -> bool: + """Check if file path matches any of the given patterns.""" + from fnmatch import fnmatch + for pattern in patterns: + if fnmatch(file_path, pattern): + return True + return False + + +def classify_file(file_path: str) -> str: + """ + Classify a file into one of four categories. + + Priority order: + 1. Auto-generated (highest priority) + 2. Configuration (before docs to catch requirements.txt) + 3. Documentation + 4. 
Source code (default) + + Args: + file_path: Path to the file + + Returns: + One of: 'auto_generated', 'documentation', 'configuration', 'source_code' + """ + if matches_pattern(file_path, AUTO_GENERATED_PATTERNS): + return 'auto_generated' + if matches_pattern(file_path, CONFIG_PATTERNS): + return 'configuration' + if matches_pattern(file_path, DOC_PATTERNS): + return 'documentation' + return 'source_code' + + +def get_file_stats(file_path: str, diff_args: List[str]) -> Dict[str, int]: + """ + Get git diff stats (additions/deletions) for a file. + + Args: + file_path: Path to the file + diff_args: Git diff arguments (e.g., ['HEAD~1', 'HEAD']) + + Returns: + Dictionary with 'additions' and 'deletions' counts + """ + try: + cmd = ['git', 'diff', '--numstat'] + diff_args + ['--', file_path] + result = subprocess.run(cmd, capture_output=True, text=True, check=False) + + if result.returncode == 0 and result.stdout.strip(): + # Output format: "additions\tdeletions\tfilename" + parts = result.stdout.strip().split('\t') + if len(parts) >= 2: + additions = int(parts[0]) if parts[0] != '-' else 0 + deletions = int(parts[1]) if parts[1] != '-' else 0 + return {'additions': additions, 'deletions': deletions} + except Exception as e: + print(f"Warning: Could not get stats for {file_path}: {e}") + + return {'additions': 0, 'deletions': 0} + + +def get_language_from_extension(file_path: str) -> Optional[str]: + """Determine programming language from file extension.""" + ext_map = { + '.py': 'python', + '.js': 'javascript', + '.ts': 'typescript', + '.jsx': 'javascript', + '.tsx': 'typescript', + '.java': 'java', + '.go': 'go', + '.rs': 'rust', + '.rb': 'ruby', + '.php': 'php', + '.c': 'c', + '.cpp': 'cpp', + '.h': 'c', + '.hpp': 'cpp', + '.cs': 'csharp', + '.swift': 'swift', + '.kt': 'kotlin', + '.sh': 'shell', + '.bash': 'shell', + } + + ext = Path(file_path).suffix + return ext_map.get(ext) + + +def determine_relationship_type(source_id: str, target_id: str, + graph_manager: 
GraphManager) -> str:
+    """
+    Determine the relationship type between two components.
+
+    Phase 1: Simple heuristics based on names and types.
+    Phase 2: Will use AI and pattern matching for advanced detection.
+
+    Args:
+        source_id: Source component ID
+        target_id: Target component ID
+        graph_manager: GraphManager instance
+
+    Returns:
+        Relationship type string
+    """
+    # Default to 'calls'
+    relationship = 'calls'
+
+    # Get component types if available
+    source = graph_manager.component_nodes.get(source_id)
+    target = graph_manager.component_nodes.get(target_id)
+
+    if source and target:
+        # Check for inheritance patterns
+        if source.component_type == 'class' and target.component_type == 'class':
+            if 'extends' in (source.summary or '').lower() or 'inherit' in (source.summary or '').lower():
+                relationship = 'extends'
+
+        # Check for interface implementation
+        if source.component_type == 'class' and target.component_type == 'interface':
+            relationship = 'implements'
+
+        # If both are in same file, might be internal reference
+        if source.file_path == target.file_path:
+            if source.parent == target_id:
+                relationship = 'member_of'
+
+    return relationship
+
+
+def transform_component_node(component_id: str,
+                             component: ComponentNode,
+                             graph_manager: GraphManager) -> Dict[str, Any]:
+    """
+    Transform a ComponentNode to structured format.
+
+    Phase 1: Uses existing data, leaves some fields as null.
+ """ + # Simple impact radius calculation from existing dependencies + impact_radius = len(component.dependencies) + len(component.dependents) + + return { + 'id': component_id, + 'parent_id': component.parent, + 'component_type': component.component_type, + 'name': component.name, + 'file_path': component.file_path, + 'old_line_number': None, # Phase 2: extract from git diff + 'new_line_number': None, # Phase 2: extract from git diff + 'change_type': component.change_type.value, + 'additions': None, # Phase 2: per-component diff analysis + 'deletions': None, # Phase 2: per-component diff analysis + 'summary': component.summary, + 'complexity': None, # Phase 2: cyclomatic complexity + 'impact_radius': impact_radius, + 'parameters': None, # Phase 2: signature parsing + 'return_type': None # Phase 2: signature parsing + } + + +def transform_file_node(file_path: str, + file_node: FileNode, + diff_args: List[str]) -> Dict[str, Any]: + """ + Transform a FileNode to structured format. + """ + stats = get_file_stats(file_path, diff_args) + language = get_language_from_extension(file_path) + + return { + 'path': file_path, + 'name': Path(file_path).name, + 'type': 'test' if 'test' in file_path.lower() else 'src', + 'change_type': file_node.change_type.value, + 'additions': stats['additions'], + 'deletions': stats['deletions'], + 'summary': file_node.summary or '', + 'language': language, + 'old_path': None # Phase 2: detect file renames + } + + +def transform_component_edge(source: str, + target: str, + graph_manager: GraphManager) -> Dict[str, Any]: + """ + Transform a component edge to structured format. 
+    """
+    relationship = determine_relationship_type(source, target, graph_manager)
+
+    # For Phase 1, we'll mark all edges as 'added' if they exist in the graph
+    # Phase 2 will properly detect added/deleted/modified/unchanged
+    return {
+        'source': source,
+        'target': target,
+        'relationship': relationship,
+        'change_type': 'added',  # Simplified for Phase 1
+        'summary': ''  # Phase 2: generate edge-specific summaries
+    }
+
+
+def transform_file_edge(source: str,
+                        target: str,
+                        graph_manager: GraphManager) -> Dict[str, Any]:
+    """
+    Transform a file edge to structured format.
+    """
+    return {
+        'source': source,
+        'target': target,
+        'relationship': 'imports',
+        'change_type': 'added',  # Simplified for Phase 1
+        'summary': f'{Path(source).name} imports {Path(target).name}'
+    }
+
+
+def categorize_files(graph_manager: GraphManager,
+                     diff_args: List[str]) -> Tuple[List[Dict], Dict[str, Dict],
+                                                    Dict[str, Dict], List[str]]:
+    """
+    Categorize files into auto_generated, documentation, configuration, and source_code.
+ + Returns: + Tuple of (auto_generated, documentation, configuration, source_code) + """ + auto_generated = [] + documentation = {} + configuration = {} + source_files = [] + + for file_path, file_node in graph_manager.file_nodes.items(): + category = classify_file(file_path) + stats = get_file_stats(file_path, diff_args) + + if category == 'auto_generated': + auto_generated.append({ + 'path': file_path, + 'classification_method': 'pattern', + 'reason': 'Matches auto-generated file pattern', + 'additions': stats['additions'], + 'deletions': stats['deletions'] + }) + + elif category == 'documentation': + documentation[file_path] = { + 'additions': stats['additions'], + 'deletions': stats['deletions'], + 'summary': file_node.summary or '', + 'sections_modified': [], # Phase 2 + 'cross_references': [] # Phase 2 + } + + elif category == 'configuration': + configuration[file_path] = { + 'additions': stats['additions'], + 'deletions': stats['deletions'], + 'summary': file_node.summary or '', + 'config_changes': [], # Phase 2 + 'cross_references': [] # Phase 2 + } + + else: # source_code + source_files.append(file_path) + + return auto_generated, documentation, configuration, source_files + + +def transform_to_structured_format(graph_manager: GraphManager, + diff_args: List[str], + diff_base: str = 'main', + diff_target: str = 'HEAD') -> Dict[str, Any]: + """ + Transform NetworkX graph data to structured format. 
+ + Args: + graph_manager: GraphManager instance with analyzed data + diff_args: Git diff arguments used for analysis + diff_base: Base git ref for the diff + diff_target: Target git ref for the diff + + Returns: + Structured format dictionary + """ + # Categorize files + auto_gen, docs, config, source_files = categorize_files(graph_manager, diff_args) + + # Calculate total stats + total_files = len(graph_manager.file_nodes) + total_additions = sum(get_file_stats(f, diff_args)['additions'] + for f in graph_manager.file_nodes.keys()) + total_deletions = sum(get_file_stats(f, diff_args)['deletions'] + for f in graph_manager.file_nodes.keys()) + + # Build source code section + source_code = { + 'files': { + 'nodes': [], + 'edges': [] + }, + 'components': { + 'nodes': [], + 'edges': [] + } + } + + # Add file nodes for source code files + for file_path in source_files: + file_node = graph_manager.file_nodes[file_path] + source_code['files']['nodes'].append( + transform_file_node(file_path, file_node, diff_args) + ) + + # Add file edges + for source, target in graph_manager.file_graph.edges(): + if source in source_files and target in source_files: + source_code['files']['edges'].append( + transform_file_edge(source, target, graph_manager) + ) + + # Add component nodes + for component_id, component in graph_manager.component_nodes.items(): + # Only include components from source code files + if component.file_path in source_files: + source_code['components']['nodes'].append( + transform_component_node(component_id, component, graph_manager) + ) + + # Add component edges + for source, target in graph_manager.component_graph.edges(): + # Only include edges where both nodes are in source code + source_comp = graph_manager.component_nodes.get(source) + target_comp = graph_manager.component_nodes.get(target) + if (source_comp and target_comp and + source_comp.file_path in source_files and + target_comp.file_path in source_files): + source_code['components']['edges'].append( + 
transform_component_edge(source, target, graph_manager) + ) + + # Build final structure + return { + 'version': '2.0', + 'metadata': { + 'analyzed_at': datetime.now(timezone.utc).isoformat(), + 'diff_base': diff_base, + 'diff_target': diff_target, + 'total_files_changed': total_files, + 'total_additions': total_additions, + 'total_deletions': total_deletions, + 'analyzer_version': '1.2.0' # TODO: Get from package + }, + 'auto_generated': auto_gen, + 'documentation': docs, + 'configuration': config, + 'source_code': source_code + } + + +def export_structured_json(graph_manager: GraphManager, + output_path: str, + diff_args: List[str] = None, + diff_base: str = 'main', + diff_target: str = 'HEAD') -> str: + """ + Export graph data in structured JSON format. + + Args: + graph_manager: GraphManager instance with analyzed data + output_path: Path where JSON file should be saved + diff_args: Git diff arguments (default: empty list) + diff_base: Base git ref for the diff + diff_target: Target git ref for the diff + + Returns: + Absolute path to the generated JSON file + """ + if diff_args is None: + diff_args = [] + + # Transform to structured format + structured_data = transform_to_structured_format( + graph_manager, diff_args, diff_base, diff_target + ) + + # Write to JSON file + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(structured_data, f, indent=2, ensure_ascii=False) + + return str(Path(output_path).absolute()) diff --git a/setup.py b/setup.py index 5624d69..4ceb746 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="wild", - version="1.1.0", + version="1.2.0", packages=find_packages(), install_requires=[ "click>=8.1.7", diff --git a/test_structured_export.py b/test_structured_export.py new file mode 100644 index 0000000..2cfaa96 --- /dev/null +++ b/test_structured_export.py @@ -0,0 +1,304 @@ +#!/usr/bin/env python +""" +Test suite for structured export functionality. 
+Tests Phase 1 implementation of integration-friendly JSON output. +""" + +import json +import os +from diffgraph.graph_manager import GraphManager, ChangeType +from diffgraph.structured_export import ( + classify_file, + export_structured_json, + transform_component_node, + transform_file_node, + transform_to_structured_format +) + + +def test_file_classification(): + """Test file classification into categories.""" + print("Testing file classification...") + + # Auto-generated files + assert classify_file('package-lock.json') == 'auto_generated' + assert classify_file('yarn.lock') == 'auto_generated' + assert classify_file('dist/bundle.min.js') == 'auto_generated' + assert classify_file('build/output.js') == 'auto_generated' + assert classify_file('__pycache__/module.pyc') == 'auto_generated' + + # Documentation files + assert classify_file('README.md') == 'documentation' + assert classify_file('docs/guide.rst') == 'documentation' + assert classify_file('CHANGELOG.md') == 'documentation' + + # Configuration files + assert classify_file('setup.py') == 'configuration' + assert classify_file('pyproject.toml') == 'configuration' + assert classify_file('package.json') == 'configuration' + assert classify_file('.eslintrc') == 'configuration' + assert classify_file('requirements.txt') == 'configuration' + + # Source code files + assert classify_file('main.py') == 'source_code' + assert classify_file('src/app.js') == 'source_code' + assert classify_file('lib/utils.go') == 'source_code' + + print("โœ… File classification tests passed") + + +def test_component_transformation(): + """Test component node transformation.""" + print("\nTesting component transformation...") + + gm = GraphManager() + gm.add_file('test.py', ChangeType.MODIFIED) + gm.add_component('TestClass', 'test.py', ChangeType.MODIFIED, 'class', + summary='A test class', dependencies=['OtherClass'], + dependents=['UsageClass']) + + component_id = 'test.py::TestClass' + component = 
gm.component_nodes[component_id] + + result = transform_component_node(component_id, component, gm) + + assert result['id'] == component_id + assert result['name'] == 'TestClass' + assert result['component_type'] == 'class' + assert result['change_type'] == 'modified' + assert result['summary'] == 'A test class' + assert result['impact_radius'] == 2 # 1 dependency + 1 dependent + assert result['complexity'] is None # Phase 2 + assert result['parameters'] is None # Phase 2 + + print("โœ… Component transformation tests passed") + + +def test_structured_export(): + """Test full structured export.""" + print("\nTesting structured export...") + + # Create test graph + gm = GraphManager() + + # Add source files + gm.add_file('src/main.py', ChangeType.MODIFIED) + gm.add_file('src/utils.py', ChangeType.ADDED) + gm.add_file('tests/test_main.py', ChangeType.ADDED) + + # Add documentation + gm.add_file('README.md', ChangeType.MODIFIED) + + # Add configuration + gm.add_file('setup.py', ChangeType.MODIFIED) + + # Add auto-generated + gm.add_file('package-lock.json', ChangeType.MODIFIED) + + # Add components + gm.add_component('MainClass', 'src/main.py', ChangeType.MODIFIED, 'class', + summary='Main application class') + gm.add_component('run', 'src/main.py', ChangeType.MODIFIED, 'method', + parent='MainClass', summary='Run method') + gm.add_component('helper', 'src/utils.py', ChangeType.ADDED, 'function', + summary='Helper function') + + # Add dependency + gm.add_component_dependency('src/main.py::run', 'src/utils.py::helper') + + # Mark as processed + gm.mark_processed('src/main.py', 'Modified main file', []) + gm.mark_processed('src/utils.py', 'Added utils', []) + gm.mark_processed('tests/test_main.py', 'Added tests', []) + gm.mark_processed('README.md', 'Updated docs', []) + gm.mark_processed('setup.py', 'Version bump', []) + gm.mark_processed('package-lock.json', 'Dependency update', []) + + # Export + output_path = 'test_structured_output.json' + result_path = 
export_structured_json(gm, output_path, diff_args=[]) + + assert os.path.exists(output_path) + print(f" Generated: {result_path}") + + # Load and validate + with open(output_path, 'r') as f: + data = json.load(f) + + # Validate top-level structure + assert 'version' in data + assert data['version'] == '2.0' + assert 'metadata' in data + assert 'auto_generated' in data + assert 'documentation' in data + assert 'configuration' in data + assert 'source_code' in data + + print(f" Version: {data['version']}") + + # Validate metadata + metadata = data['metadata'] + assert 'analyzed_at' in metadata + assert 'total_files_changed' in metadata + assert metadata['total_files_changed'] == 6 + + print(f" Files changed: {metadata['total_files_changed']}") + + # Validate categorization + assert len(data['auto_generated']) == 1 + assert data['auto_generated'][0]['path'] == 'package-lock.json' + + assert 'README.md' in data['documentation'] + + assert 'setup.py' in data['configuration'] + + print(f" Auto-generated: {len(data['auto_generated'])}") + print(f" Documentation: {len(data['documentation'])}") + print(f" Configuration: {len(data['configuration'])}") + + # Validate source code structure + source = data['source_code'] + assert 'files' in source + assert 'components' in source + + files = source['files'] + assert 'nodes' in files + assert 'edges' in files + + # Should have 3 source files (main.py, utils.py, test_main.py) + assert len(files['nodes']) == 3 + + print(f" Source files: {len(files['nodes'])}") + + components = source['components'] + assert 'nodes' in components + assert 'edges' in components + + # Should have 3 components + assert len(components['nodes']) == 3 + + print(f" Components: {len(components['nodes'])}") + + # Should have 1 edge (run -> helper) + assert len(components['edges']) == 1 + + print(f" Component edges: {len(components['edges'])}") + + # Validate component structure + component_node = components['nodes'][0] + assert 'id' in component_node + assert 
'name' in component_node + assert 'component_type' in component_node + assert 'change_type' in component_node + assert 'summary' in component_node + assert 'impact_radius' in component_node + + # Check Phase 1 null fields + assert component_node['complexity'] is None + assert component_node['old_line_number'] is None + assert component_node['new_line_number'] is None + assert component_node['parameters'] is None + + print(" โœ“ Component structure validated") + + # Validate edge structure + edge = components['edges'][0] + assert 'source' in edge + assert 'target' in edge + assert 'relationship' in edge + assert 'change_type' in edge + assert edge['source'] == 'src/main.py::run' + assert edge['target'] == 'src/utils.py::helper' + + print(" โœ“ Edge structure validated") + + # Cleanup + os.remove(output_path) + print(" โœ“ Cleaned up test file") + + print("โœ… Structured export tests passed") + + +def test_graph_completeness(): + """Test that all edge targets exist as nodes.""" + print("\nTesting graph completeness...") + + gm = GraphManager() + gm.add_file('file1.py', ChangeType.MODIFIED) + gm.add_file('file2.py', ChangeType.ADDED) + + gm.add_component('Func1', 'file1.py', ChangeType.MODIFIED, 'function') + gm.add_component('Func2', 'file2.py', ChangeType.ADDED, 'function') + + gm.add_component_dependency('file1.py::Func1', 'file2.py::Func2') + + gm.mark_processed('file1.py', 'File 1', []) + gm.mark_processed('file2.py', 'File 2', []) + + # Export and validate + output_path = 'test_completeness.json' + export_structured_json(gm, output_path, diff_args=[]) + + with open(output_path, 'r') as f: + data = json.load(f) + + # Get all component node IDs + component_ids = set(node['id'] for node in data['source_code']['components']['nodes']) + + # Verify all edge sources and targets exist in nodes + for edge in data['source_code']['components']['edges']: + assert edge['source'] in component_ids, f"Source {edge['source']} not in nodes" + assert edge['target'] in 
component_ids, f"Target {edge['target']} not in nodes" + + print(" โœ“ All edge sources exist as nodes") + print(" โœ“ All edge targets exist as nodes") + + os.remove(output_path) + + print("โœ… Graph completeness tests passed") + + +def test_empty_graph(): + """Test handling of empty graph.""" + print("\nTesting empty graph...") + + gm = GraphManager() + + output_path = 'test_empty.json' + export_structured_json(gm, output_path, diff_args=[]) + + with open(output_path, 'r') as f: + data = json.load(f) + + assert data['version'] == '2.0' + assert data['metadata']['total_files_changed'] == 0 + assert len(data['auto_generated']) == 0 + assert len(data['documentation']) == 0 + assert len(data['configuration']) == 0 + assert len(data['source_code']['files']['nodes']) == 0 + assert len(data['source_code']['components']['nodes']) == 0 + + os.remove(output_path) + + print("โœ… Empty graph tests passed") + + +def main(): + """Run all tests.""" + print("=" * 60) + print("Testing Structured Export (Phase 1)") + print("=" * 60) + + test_file_classification() + test_component_transformation() + test_structured_export() + test_graph_completeness() + test_empty_graph() + + print("\n" + "=" * 60) + print("โœ… All tests passed!") + print("=" * 60) + + +if __name__ == "__main__": + main() From d43d668f05e8b913c665d7b312c000c4e9875033 Mon Sep 17 00:00:00 2001 From: Avikalp Kumar Gupta Date: Tue, 28 Oct 2025 18:30:14 -0700 Subject: [PATCH 4/4] fix: json creation was failing because components don't have 'parent_id', but just 'parent'. refactor: Clean up whitespace in structured_export.py This commit removes unnecessary whitespace throughout the structured_export.py file, improving code readability without altering functionality. The changes include adjustments to spacing in function definitions, docstrings, and return statements, ensuring a more consistent coding style. 
--- diffgraph/structured_export.py | 110 ++++++++++++++++----------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/diffgraph/structured_export.py b/diffgraph/structured_export.py index 49034ec..06c6054 100644 --- a/diffgraph/structured_export.py +++ b/diffgraph/structured_export.py @@ -53,16 +53,16 @@ def matches_pattern(file_path: str, patterns: List[str]) -> bool: def classify_file(file_path: str) -> str: """ Classify a file into one of four categories. - + Priority order: 1. Auto-generated (highest priority) 2. Configuration (before docs to catch requirements.txt) 3. Documentation 4. Source code (default) - + Args: file_path: Path to the file - + Returns: One of: 'auto_generated', 'documentation', 'configuration', 'source_code' """ @@ -78,18 +78,18 @@ def classify_file(file_path: str) -> str: def get_file_stats(file_path: str, diff_args: List[str]) -> Dict[str, int]: """ Get git diff stats (additions/deletions) for a file. - + Args: file_path: Path to the file diff_args: Git diff arguments (e.g., ['HEAD~1', 'HEAD']) - + Returns: Dictionary with 'additions' and 'deletions' counts """ try: cmd = ['git', 'diff', '--numstat'] + diff_args + ['--', file_path] result = subprocess.run(cmd, capture_output=True, text=True, check=False) - + if result.returncode == 0 and result.stdout.strip(): # Output format: "additions\tdeletions\tfilename" parts = result.stdout.strip().split('\t') @@ -99,7 +99,7 @@ def get_file_stats(file_path: str, diff_args: List[str]) -> Dict[str, int]: return {'additions': additions, 'deletions': deletions} except Exception as e: print(f"Warning: Could not get stats for {file_path}: {e}") - + return {'additions': 0, 'deletions': 0} @@ -126,63 +126,63 @@ def get_language_from_extension(file_path: str) -> Optional[str]: '.sh': 'shell', '.bash': 'shell', } - + ext = Path(file_path).suffix return ext_map.get(ext) -def determine_relationship_type(source_id: str, target_id: str, +def determine_relationship_type(source_id: str, 
target_id: str, graph_manager: GraphManager) -> str: """ Determine the relationship type between two components. - + Phase 1: Simple heuristics based on names and types. Phase 2: Will use AI and pattern matching for advanced detection. - + Args: source_id: Source component ID target_id: Target component ID graph_manager: GraphManager instance - + Returns: Relationship type string """ # Default to 'calls' relationship = 'calls' - + # Get component types if available source = graph_manager.component_nodes.get(source_id) target = graph_manager.component_nodes.get(target_id) - + if source and target: # Check for inheritance patterns if source.component_type == 'class' and target.component_type == 'class': if 'extends' in (source.summary or '').lower() or 'inherit' in (source.summary or '').lower(): relationship = 'extends' - + # Check for interface implementation if source.component_type == 'class' and target.component_type == 'interface': relationship = 'implements' - + # If both are in same file, might be internal reference if source.file_path == target.file_path: - if source.parent_id == target_id: + if source.parent == target_id: relationship = 'member_of' - + return relationship -def transform_component_node(component_id: str, +def transform_component_node(component_id: str, component: ComponentNode, graph_manager: GraphManager) -> Dict[str, Any]: """ Transform a ComponentNode to structured format. - + Phase 1: Uses existing data, leaves some fields as null. 
""" # Simple impact radius calculation from existing dependencies impact_radius = len(component.dependencies) + len(component.dependents) - + return { 'id': component_id, 'parent_id': component.parent, @@ -202,7 +202,7 @@ def transform_component_node(component_id: str, } -def transform_file_node(file_path: str, +def transform_file_node(file_path: str, file_node: FileNode, diff_args: List[str]) -> Dict[str, Any]: """ @@ -210,7 +210,7 @@ def transform_file_node(file_path: str, """ stats = get_file_stats(file_path, diff_args) language = get_language_from_extension(file_path) - + return { 'path': file_path, 'name': Path(file_path).name, @@ -224,14 +224,14 @@ def transform_file_node(file_path: str, } -def transform_component_edge(source: str, +def transform_component_edge(source: str, target: str, graph_manager: GraphManager) -> Dict[str, Any]: """ Transform a component edge to structured format. """ relationship = determine_relationship_type(source, target, graph_manager) - + # For Phase 1, we'll mark all edges as 'added' if they exist in the graph # Phase 2 will properly detect added/deleted/modified/unchanged return { @@ -243,7 +243,7 @@ def transform_component_edge(source: str, } -def transform_file_edge(source: str, +def transform_file_edge(source: str, target: str, graph_manager: GraphManager) -> Dict[str, Any]: """ @@ -258,12 +258,12 @@ def transform_file_edge(source: str, } -def categorize_files(graph_manager: GraphManager, - diff_args: List[str]) -> Tuple[List[Dict], Dict[str, Dict], +def categorize_files(graph_manager: GraphManager, + diff_args: List[str]) -> Tuple[List[Dict], Dict[str, Dict], Dict[str, Dict], Dict[str, Any]]: """ Categorize files into auto_generated, documentation, configuration, and source_code. 
- + Returns: Tuple of (auto_generated, documentation, configuration, source_code) """ @@ -271,11 +271,11 @@ def categorize_files(graph_manager: GraphManager, documentation = {} configuration = {} source_files = [] - + for file_path, file_node in graph_manager.file_nodes.items(): category = classify_file(file_path) stats = get_file_stats(file_path, diff_args) - + if category == 'auto_generated': auto_generated.append({ 'path': file_path, @@ -284,7 +284,7 @@ def categorize_files(graph_manager: GraphManager, 'additions': stats['additions'], 'deletions': stats['deletions'] }) - + elif category == 'documentation': documentation[file_path] = { 'additions': stats['additions'], @@ -293,7 +293,7 @@ def categorize_files(graph_manager: GraphManager, 'sections_modified': [], # Phase 2 'cross_references': [] # Phase 2 } - + elif category == 'configuration': configuration[file_path] = { 'additions': stats['additions'], @@ -302,39 +302,39 @@ def categorize_files(graph_manager: GraphManager, 'config_changes': [], # Phase 2 'cross_references': [] # Phase 2 } - + else: # source_code source_files.append(file_path) - + return auto_generated, documentation, configuration, source_files -def transform_to_structured_format(graph_manager: GraphManager, +def transform_to_structured_format(graph_manager: GraphManager, diff_args: List[str], diff_base: str = 'main', diff_target: str = 'HEAD') -> Dict[str, Any]: """ Transform NetworkX graph data to structured format. 
- + Args: graph_manager: GraphManager instance with analyzed data diff_args: Git diff arguments used for analysis diff_base: Base git ref for the diff diff_target: Target git ref for the diff - + Returns: Structured format dictionary """ # Categorize files auto_gen, docs, config, source_files = categorize_files(graph_manager, diff_args) - + # Calculate total stats total_files = len(graph_manager.file_nodes) - total_additions = sum(get_file_stats(f, diff_args)['additions'] + total_additions = sum(get_file_stats(f, diff_args)['additions'] for f in graph_manager.file_nodes.keys()) - total_deletions = sum(get_file_stats(f, diff_args)['deletions'] + total_deletions = sum(get_file_stats(f, diff_args)['deletions'] for f in graph_manager.file_nodes.keys()) - + # Build source code section source_code = { 'files': { @@ -346,21 +346,21 @@ def transform_to_structured_format(graph_manager: GraphManager, 'edges': [] } } - + # Add file nodes for source code files for file_path in source_files: file_node = graph_manager.file_nodes[file_path] source_code['files']['nodes'].append( transform_file_node(file_path, file_node, diff_args) ) - + # Add file edges for source, target in graph_manager.file_graph.edges(): if source in source_files and target in source_files: source_code['files']['edges'].append( transform_file_edge(source, target, graph_manager) ) - + # Add component nodes for component_id, component in graph_manager.component_nodes.items(): # Only include components from source code files @@ -368,19 +368,19 @@ def transform_to_structured_format(graph_manager: GraphManager, source_code['components']['nodes'].append( transform_component_node(component_id, component, graph_manager) ) - + # Add component edges for source, target in graph_manager.component_graph.edges(): # Only include edges where both nodes are in source code source_comp = graph_manager.component_nodes.get(source) target_comp = graph_manager.component_nodes.get(target) - if (source_comp and target_comp and - 
source_comp.file_path in source_files and + if (source_comp and target_comp and + source_comp.file_path in source_files and target_comp.file_path in source_files): source_code['components']['edges'].append( transform_component_edge(source, target, graph_manager) ) - + # Build final structure return { 'version': '2.0', @@ -400,34 +400,34 @@ def transform_to_structured_format(graph_manager: GraphManager, } -def export_structured_json(graph_manager: GraphManager, +def export_structured_json(graph_manager: GraphManager, output_path: str, diff_args: List[str] = None, diff_base: str = 'main', diff_target: str = 'HEAD') -> str: """ Export graph data in structured JSON format. - + Args: graph_manager: GraphManager instance with analyzed data output_path: Path where JSON file should be saved diff_args: Git diff arguments (default: empty list) diff_base: Base git ref for the diff diff_target: Target git ref for the diff - + Returns: Absolute path to the generated JSON file """ if diff_args is None: diff_args = [] - + # Transform to structured format structured_data = transform_to_structured_format( graph_manager, diff_args, diff_base, diff_target ) - + # Write to JSON file with open(output_path, 'w', encoding='utf-8') as f: json.dump(structured_data, f, indent=2, ensure_ascii=False) - + return str(Path(output_path).absolute())