
Commit 603f06f

Added project proposal and quality metrics
1 parent ac5d1d3 commit 603f06f

File tree

2 files changed: +364 -0 lines changed
Lines changed: 224 additions & 0 deletions
```python
# courseProjectCode/Metrics/metrics_collector.py
"""Collect simple maintainability and testability metrics for the repository.

Walks the source tree, analyses each source file, and prints a JSON report
with per-file metrics and an aggregated summary.
"""

import ast
import json
import os
import re
from typing import Dict, List, Tuple


ROOT_DIR = os.getcwd()

SKIP_DIRS = {
    "node_modules",
    "courseProjectDocs",
    "courseProjectCode",
    ".git",
    "__pycache__",
}

# Include .js as well: analyse_file() handles JavaScript, and without this
# entry count_js_functions() would never be reached.
SOURCE_EXTENSIONS = {".py", ".js"}


def count_python_functions(file_content: str) -> Tuple[int, List[Tuple[int, int]]]:
    """Return the number of function definitions and their (start, end) line spans."""
    try:
        tree = ast.parse(file_content)
    except SyntaxError:
        return 0, []

    function_spans = []
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            # end_lineno is available in Python 3.8+
            start_line = getattr(node, "lineno", None)
            end_line = getattr(node, "end_lineno", None)
            if start_line is not None and end_line is not None:
                function_spans.append((start_line, end_line))
    return len(function_spans), function_spans


def count_js_functions(file_content: str) -> Tuple[int, List[Tuple[int, int]]]:
    """Heuristic count: one function per non-comment line containing `function` or `=>`."""
    lines = file_content.splitlines()
    count = 0
    spans = []
    for idx, line in enumerate(lines, start=1):
        stripped = line.strip()
        if stripped.startswith("//") or stripped.startswith("/*"):
            continue
        if re.search(r"\bfunction\b", stripped) or re.search(r"=>", stripped):
            count += 1
            spans.append((idx, idx))
    return count, spans


def approximate_cyclomatic_complexity(lines: List[str]) -> int:
    """Approximate cyclomatic complexity by counting lines with decision keywords."""
    complexity = 1  # Base complexity
    decision_keywords = [
        "if ", "for ", "while ", "case ", "switch ", "catch ", "&&", "||", "?",
        "elif ", "except ",
    ]
    for line in lines:
        stripped = line.strip()
        if not stripped or stripped.startswith("#") or stripped.startswith("//"):
            continue
        for keyword in decision_keywords:
            if keyword in stripped:
                complexity += 1
                break
    return complexity


def analyse_file(filepath: str) -> Dict[str, object]:
    """Compute per-file metrics; returns an empty dict if the file cannot be read."""
    try:
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        return {}

    lines = content.splitlines()
    code_lines = 0
    comment_lines = 0
    in_block_comment = False

    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        if in_block_comment:
            comment_lines += 1
            if "*/" in stripped:
                in_block_comment = False
            continue
        if stripped.startswith("/*"):
            comment_lines += 1
            if "*/" not in stripped:
                in_block_comment = True
            continue
        if stripped.startswith("#") or stripped.startswith("//"):
            comment_lines += 1
            continue
        # Heuristic: only the opening line of a docstring is counted as a comment.
        if stripped.startswith('"""'):
            comment_lines += 1
            continue
        code_lines += 1

    ext = os.path.splitext(filepath)[1]
    functions_count = 0
    function_spans: List[Tuple[int, int]] = []
    if ext == ".py":
        functions_count, function_spans = count_python_functions(content)
    elif ext == ".js":
        functions_count, function_spans = count_js_functions(content)

    total_function_lines = 0
    for start, end in function_spans:
        if end >= start:
            total_function_lines += end - start + 1
    average_function_length = (
        (total_function_lines / functions_count) if functions_count > 0 else 0
    )

    complexity = approximate_cyclomatic_complexity(lines)

    # A file counts as a test file if any path component starts with "test".
    parts = filepath.lower().split(os.sep)
    is_test_file = any(
        part.startswith("test") for part in parts if part not in {"", "."}
    )

    test_functions_count = 0
    if is_test_file:
        if ext == ".py":
            try:
                tree = ast.parse(content)
            except SyntaxError:
                tree = None
            if tree is not None:
                for node in ast.walk(tree):
                    if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                        if node.name.startswith("test"):
                            test_functions_count += 1
        elif ext == ".js":
            test_functions_count = len(re.findall(r"\b(it|describe)\s*\(", content))

    return {
        "file": filepath,
        "lines_of_code": code_lines,
        "comment_lines": comment_lines,
        "comment_ratio": (comment_lines / code_lines) if code_lines > 0 else 0,
        "functions": functions_count,
        "average_function_length": average_function_length,
        "cyclomatic_complexity": complexity,
        "is_test_file": is_test_file,
        "test_functions": test_functions_count,
    }


def walk_repository(root_dir: str) -> List[Dict[str, object]]:
    """Analyse every source file under root_dir, skipping SKIP_DIRS."""
    results = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Remove skipped directories from traversal
        dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS]
        for filename in filenames:
            ext = os.path.splitext(filename)[1]
            if ext in SOURCE_EXTENSIONS:
                filepath = os.path.join(dirpath, filename)
                metrics = analyse_file(filepath)
                if metrics:
                    results.append(metrics)
    return results


def aggregate_metrics(results: List[Dict[str, object]]) -> Dict[str, object]:
    """Roll per-file metrics up into repository-wide totals and ratios."""
    total_code_lines = sum(item["lines_of_code"] for item in results)
    total_comment_lines = sum(item["comment_lines"] for item in results)
    total_functions = sum(item["functions"] for item in results)
    total_complexity = sum(item["cyclomatic_complexity"] for item in results)
    total_files = len(results)

    total_function_lines = sum(
        item["average_function_length"] * item["functions"] for item in results
    )
    average_function_length = (
        total_function_lines / total_functions if total_functions > 0 else 0
    )
    comment_ratio = (
        (total_comment_lines / total_code_lines) if total_code_lines > 0 else 0
    )

    test_files = [item for item in results if item["is_test_file"]]
    total_test_files = len(test_files)
    total_test_lines = sum(item["lines_of_code"] for item in test_files)
    total_test_functions = sum(item["test_functions"] for item in test_files)
    test_ratio = (
        (total_test_lines / total_code_lines) if total_code_lines > 0 else 0
    )

    aggregated = {
        "total_files": total_files,
        "total_code_lines": total_code_lines,
        "total_comment_lines": total_comment_lines,
        "comment_ratio": comment_ratio,
        "total_functions": total_functions,
        "average_function_length": average_function_length,
        "total_cyclomatic_complexity": total_complexity,
        "total_test_files": total_test_files,
        "total_test_lines": total_test_lines,
        "total_test_functions": total_test_functions,
        "test_ratio": test_ratio,
    }
    return aggregated


def main() -> None:
    results = walk_repository(ROOT_DIR)
    aggregated = aggregate_metrics(results)
    report = {
        "files": results,
        "summary": aggregated,
    }
    print(json.dumps(report, indent=2))


if __name__ == "__main__":
    main()
```
Lines changed: 140 additions & 0 deletions
# Project Proposal

## Project Overview

Our course project aims to build a lightweight data analysis library that
mimics essential features of the pandas ecosystem. The library will provide
tabular data structures (similar to DataFrame and Series) and support common
operations needed by scientists and engineers working with structured data.
Major functional goals include:
- **Handling missing data:** The system should represent missing values as
  NaN, NA, or NaT and propagate them through computations. This capability
  simplifies data cleaning and statistical analysis by preventing silent
  errors (software.com).

- **Size mutability:** Users should be able to insert or delete columns and
  rows in data structures. Dynamic resizing is central to interactive
  analysis workflows where the shape of a table evolves as new information
  becomes available (raw.githubusercontent.com).

- **Automatic and explicit data alignment:** When performing arithmetic or
  merging operations, the system will align data on labels or allow users
  to opt out of alignment entirely. Proper alignment prevents accidental
  mismatches and promotes reproducible results (raw.githubusercontent.com).

- **Flexible group-by operations:** The library should implement
  split–apply–combine patterns for aggregation, transformation, and
  filtering so that users can summarise data by categories with a single
  fluent expression (raw.githubusercontent.com); see the sketch after this
  list.

- **Robust I/O tooling:** Data structures must load from and save to common
  file formats (CSV, Excel) and efficiently persist to high-performance
  formats such as HDF5 (raw.githubusercontent.com).

- **Time-series functionality:** Operations like date-range generation,
  frequency conversion, moving-window statistics, and date shifting will be
  built in so that time-indexed data can be analysed without external
  libraries (raw.githubusercontent.com).
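To make the split–apply–combine goal concrete, here is a minimal plain-Python
sketch of the pattern the library will generalise (the function, field names,
and data are illustrative, not a committed API):

```python
from collections import defaultdict

def group_by_apply(rows, key, aggregate):
    """Split rows by key, apply aggregate to each group, combine into a dict."""
    groups = defaultdict(list)
    for row in rows:                  # split
        groups[row[key]].append(row)
    return {k: aggregate(v) for k, v in groups.items()}  # apply + combine

sales = [
    {"region": "north", "amount": 10},
    {"region": "south", "amount": 7},
    {"region": "north", "amount": 5},
]
totals = group_by_apply(sales, "region", lambda g: sum(r["amount"] for r in g))
print(totals)  # {'north': 15, 'south': 7}
```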
In addition to these functional requirements, the project emphasises
non-functional qualities such as performance, flexibility, and expressive
APIs. The goal is to provide an intuitive open-source tool that researchers
can use to analyse data without sacrificing speed or power
(raw.githubusercontent.com).

---

## Key Quality Metrics

To ensure that the implementation is maintainable and testable, we will
track several quality metrics throughout the project lifecycle. The metrics
were selected based on guidance from software engineering literature and
industry best practices.

### Maintainability metrics
- **Maintainability index (MI):** Visual Studio defines an index from 0 to
  100 that summarises the ease of maintaining a piece of code. Higher values
  indicate more maintainable code, with scores of 20 and above considered
  “good,” 10–19 “moderate,” and below 10 “poor” (learn.microsoft.com). MI
  combines several measurements such as cyclomatic complexity, depth of
  inheritance, and class coupling. Although we do not compute MI directly,
  we monitor its constituent components to track trends over time (see the
  formula after this list).

- **Cyclomatic complexity:** This measures the number of linearly
  independent paths through a program. Each decision point (e.g., if, for,
  while) adds one to the count. Higher complexity indicates more potential
  execution paths and requires more tests to achieve full coverage
  (learn.microsoft.com). Our metrics script approximates cyclomatic
  complexity by scanning for decision keywords, providing a reproducible
  indicator of structural complexity (a worked example follows this list).

- **Comment-to-code ratio:** The number of comment lines divided by the
  number of executable lines (software.com). Comments capture design
  assumptions, edge cases, and rationale that are not obvious from code
  alone. A moderate ratio improves maintainability by aiding knowledge
  transfer and reducing ramp-up time for new contributors (software.com).
  However, excessively high ratios can reflect commented-out code or
  verbose documentation, so the ratio should be interpreted in context
  (software.com).

- **Average function length:** Smaller functions tend to perform a single
  task and are easier to understand, and thus easier to modify. The metrics
  script measures the average number of code lines per function. Keeping
  this metric low encourages modular design and aligns with the Single
  Responsibility Principle.

- **Class coupling and depth of inheritance:** Although our project uses
  primarily functions and data structures, we will monitor coupling and
  inheritance depth where applicable. Visual Studio’s guidance notes that
  high class coupling and deep inheritance trees decrease maintainability
  (learn.microsoft.com). We will minimise dependencies between modules and
  favour composition over inheritance to keep these metrics low.
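For reference, the rescaled MI formula documented by Microsoft (stated here
from its documentation; our script does not compute it) is:

    MI = MAX(0, (171 − 5.2 × ln(Halstead Volume)
                     − 0.23 × (Cyclomatic Complexity)
                     − 16.2 × ln(Lines of Code)) × 100 / 171)

And to make the complexity heuristic concrete, here is a small worked example
that calls the script's `approximate_cyclomatic_complexity` (this assumes the
script is importable as `metrics_collector`):

```python
from metrics_collector import approximate_cyclomatic_complexity

snippet = '''
def classify(x):
    if x < 0:
        return "negative"
    for ch in str(x):
        if ch == "0":
            return "has zero"
    return "positive"
'''.splitlines()

# 1 (base) + 3 lines containing decision keywords ("if ", "for ", "if ") = 4
print(approximate_cyclomatic_complexity(snippet))  # 4
```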
### Testability metrics

- **Test coverage:** Atlassian describes code coverage as a measure of how
  much of the code base is exercised by tests and notes several metrics:
  function, statement, branch, condition, and line coverage (atlassian.com).
  Although a high coverage percentage does not guarantee good tests, it
  reveals which parts of the system remain untested and helps to prioritise
  additional testing efforts. Since we cannot run external coverage tools
  in this environment, our metrics script approximates test effort by
  reporting the ratio of lines in test files to total lines of code and
  counting the number of test functions. Increasing the test-to-code ratio
  over time should correlate with improved coverage.

- **Number of test cases:** We treat each `test_*` function in Python and
  each call to `describe`/`it` in JavaScript as an individual test case.
  Tracking the number of test cases encourages developers to write focused,
  granular tests and highlights subsystems that may need additional
  verification.

- **Complexity vs. tests:** Cyclomatic complexity tells us how many test
  cases are theoretically required to exercise all execution paths
  (learn.microsoft.com). By comparing the number of test cases to the
  aggregate complexity of the code base, we can judge whether testing is
  keeping pace with growing code intricacy. If complexity rises faster than
  test counts, there may be untested paths that warrant attention (see the
  snippet after this list).
130+
131+
## Using the metrics
132+
133+
The `metrics_collector.py` script in `courseProjectCode/Metrics/`
134+
implements the measurements described above. Running the script
135+
generates a JSON report containing per-file metrics and a summary.
136+
These metrics will form the basis of our quality dashboard and guide
137+
refactoring and testing priorities throughout the project. By
138+
monitoring comment ratios, function lengths, complexity and test
139+
ratios, we can make data-driven decisions to keep the code base
140+
maintainable and to ensure that behaviour is thoroughly validated.
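For example, the report can be generated and saved from the repository root
like this (the output path is arbitrary; the summary keys are the ones the
script emits, with illustrative values):

```bash
python courseProjectCode/Metrics/metrics_collector.py > metrics_report.json
```

```
{
  "files": [ ... ],
  "summary": {
    "total_files": 12,
    "total_code_lines": 1480,
    "comment_ratio": 0.21,
    "total_functions": 96,
    "average_function_length": 11.4,
    "total_cyclomatic_complexity": 240,
    "total_test_files": 4,
    "total_test_functions": 35,
    "test_ratio": 0.18
  }
}
```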

0 commit comments
