Sandeeep #62887 (Closed)
224 changes: 224 additions & 0 deletions courseProjectCode/Metrics/metrics_collector.py
@@ -0,0 +1,224 @@
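"""Collect simple maintainability and testability metrics for this repository.

Walks the working tree from the current directory, analyses each source
file, and prints a JSON report with per-file metrics and a summary.
"""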


import ast
import json
import os
import re
from typing import Dict, List, Tuple


ROOT_DIR = os.getcwd()

SKIP_DIRS = {
"node_modules",
"courseProjectDocs",
"courseProjectCode",
".git",
"__pycache__",
}

# .js is included because count_js_functions and the JS test-case count
# below expect to see JavaScript sources.
SOURCE_EXTENSIONS = {".py", ".js"}


def count_python_functions(file_content: str) -> Tuple[int, List[Tuple[int, int]]]:
    """Return the function count and (start, end) line spans via the AST."""
try:
tree = ast.parse(file_content)
except SyntaxError:
return 0, []

function_spans = []
for node in ast.walk(tree):
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
# end_lineno is available in Python 3.8+
start_line = getattr(node, "lineno", None)
end_line = getattr(node, "end_lineno", None)
if start_line is not None and end_line is not None:
function_spans.append((start_line, end_line))
return len(function_spans), function_spans


def count_js_functions(file_content: str) -> Tuple[int, List[Tuple[int, int]]]:
    """Heuristically count JS functions by scanning lines for `function` or `=>`."""
lines = file_content.splitlines()
count = 0
spans = []
for idx, line in enumerate(lines, start=1):
stripped = line.strip()
if stripped.startswith("//") or stripped.startswith("/*"):
continue
if re.search(r"\bfunction\b", stripped) or re.search(r"=>", stripped):
count += 1
spans.append((idx, idx))
return count, spans


def approximate_cyclomatic_complexity(lines: List[str]) -> int:
    """Approximate cyclomatic complexity by scanning for decision keywords.

    At most one decision is counted per line, so this is an estimate
    rather than an exact measure.
    """
    complexity = 1  # Base complexity: a single straight-line path
decision_keywords = [
"if ", "for ", "while ", "case ", "switch ", "catch ", "&&", "||", "?",
"elif ", "except ",
]
for line in lines:
stripped = line.strip()
if not stripped or stripped.startswith("#") or stripped.startswith("//"):
continue
for keyword in decision_keywords:
if keyword in stripped:
complexity += 1
break
return complexity


def analyse_file(filepath: str) -> Dict[str, object]:
    """Compute metrics for one file; returns {} if the file cannot be read."""
try:
with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
content = f.read()
except (OSError, UnicodeDecodeError):
return {}

lines = content.splitlines()
code_lines = 0
comment_lines = 0
in_block_comment = False

for line in lines:
stripped = line.strip()
if not stripped:
continue
if in_block_comment:
comment_lines += 1
if "*/" in stripped:
in_block_comment = False
continue
if stripped.startswith("/*"):
comment_lines += 1
if "*/" not in stripped:
in_block_comment = True
continue
if stripped.startswith("#") or stripped.startswith("//"):
comment_lines += 1
continue
        # Heuristic: only a docstring's opening line counts as a comment.
        if stripped.startswith('"""') or stripped.startswith("'''"):
            comment_lines += 1
            continue
code_lines += 1

ext = os.path.splitext(filepath)[1]
functions_count = 0
function_spans: List[Tuple[int, int]] = []
if ext == ".py":
functions_count, function_spans = count_python_functions(content)
elif ext == ".js":
functions_count, function_spans = count_js_functions(content)

total_function_lines = 0
for start, end in function_spans:
if end >= start:
total_function_lines += end - start + 1
average_function_length = (
(total_function_lines / functions_count) if functions_count > 0 else 0
)

complexity = approximate_cyclomatic_complexity(lines)

parts = filepath.lower().split(os.sep)
is_test_file = any(
part.startswith("test") for part in parts if part not in {"", "."}
)

test_functions_count = 0
if is_test_file:
if ext == ".py":
try:
tree = ast.parse(content)
except SyntaxError:
tree = None
if tree is not None:
for node in ast.walk(tree):
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
if node.name.startswith("test"):
test_functions_count += 1
elif ext == ".js":
test_functions_count = len(re.findall(r"\b(it|describe)\s*\(", content))

return {
"file": filepath,
"lines_of_code": code_lines,
"comment_lines": comment_lines,
"comment_ratio": (comment_lines / code_lines) if code_lines > 0 else 0,
"functions": functions_count,
"average_function_length": average_function_length,
"cyclomatic_complexity": complexity,
"is_test_file": is_test_file,
"test_functions": test_functions_count,
}


def walk_repository(root_dir: str) -> List[Dict[str, object]]:
results = []
for dirpath, dirnames, filenames in os.walk(root_dir):
# Remove skipped directories from traversal
dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS]
for filename in filenames:
ext = os.path.splitext(filename)[1]
if ext in SOURCE_EXTENSIONS:
filepath = os.path.join(dirpath, filename)
metrics = analyse_file(filepath)
if metrics:
results.append(metrics)
return results


def aggregate_metrics(results: List[Dict[str, object]]) -> Dict[str, object]:
    """Roll per-file metrics up into repository-wide totals and ratios."""
total_code_lines = sum(item["lines_of_code"] for item in results)
total_comment_lines = sum(item["comment_lines"] for item in results)
total_functions = sum(item["functions"] for item in results)
total_complexity = sum(item["cyclomatic_complexity"] for item in results)
total_files = len(results)

total_function_lines = sum(
item["average_function_length"] * item["functions"] for item in results
)
average_function_length = (
total_function_lines / total_functions if total_functions > 0 else 0
)
comment_ratio = (
(total_comment_lines / total_code_lines) if total_code_lines > 0 else 0
)

test_files = [item for item in results if item["is_test_file"]]
total_test_files = len(test_files)
total_test_lines = sum(item["lines_of_code"] for item in test_files)
total_test_functions = sum(item["test_functions"] for item in test_files)
test_ratio = (
(total_test_lines / total_code_lines) if total_code_lines > 0 else 0
)

aggregated = {
"total_files": total_files,
"total_code_lines": total_code_lines,
"total_comment_lines": total_comment_lines,
"comment_ratio": comment_ratio,
"total_functions": total_functions,
"average_function_length": average_function_length,
"total_cyclomatic_complexity": total_complexity,
"total_test_files": total_test_files,
"total_test_lines": total_test_lines,
"total_test_functions": total_test_functions,
"test_ratio": test_ratio,
}
return aggregated


def main() -> None:
results = walk_repository(ROOT_DIR)
aggregated = aggregate_metrics(results)
report = {
"files": results,
"summary": aggregated,
}
print(json.dumps(report, indent=2))


if __name__ == "__main__":
main()
140 changes: 140 additions & 0 deletions courseProjectCode/project-proposal.md
@@ -0,0 +1,140 @@
# Project Proposal

## Project Overview

Our course project aims to build a lightweight data analysis library that
mimics essential features of the pandas ecosystem. The library will
provide tabular data structures (similar to DataFrame and Series) and
support common operations needed by scientists and engineers working
with structured data. Major functional goals include:

- **Handling missing data:** The system should represent missing values as
  NaN, NA or NaT and propagate them through computations. This
  capability simplifies data cleaning and statistical analysis by
  preventing silent errors.

- **Size mutability:** Users should be able to insert or delete columns
  and rows in data structures. Dynamic resizing is central to
  interactive analysis workflows where the shape of a table evolves as
  new information becomes available.

- **Automatic and explicit data alignment:** When performing
  arithmetic or merging operations, the system will align data on
  labels or allow users to opt out of alignment entirely. Proper
  alignment prevents accidental mismatches and promotes reproducible
  results.

- **Flexible group-by operations:** The library should implement
  split–apply–combine patterns for aggregation, transformation, and
  filtering so that users can summarise data by categories with a
  single fluent expression.

- **Robust I/O tooling:** Data structures must load from and save to
  common file formats (CSV, Excel) and efficiently persist to
  high-performance formats such as HDF5.

- **Time-series functionality:** Operations like date-range generation,
  frequency conversion, moving-window statistics and date shifting will
  be built in so that time-indexed data can be analysed without
  external libraries.

In addition to these functional requirements, the project emphasises
non-functional qualities such as performance, flexibility and
expressive APIs. The goal is to provide an intuitive open-source tool
that researchers can use to analyse data without sacrificing speed or
power.
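
Because the library deliberately mimics pandas, the target behaviours
can be illustrated with their pandas equivalents. The snippet below is
a sketch of the behaviours we aim to reproduce, shown with pandas
itself rather than with our own (not yet designed) API:

```python
# Reference behaviours, demonstrated with pandas (the model named above).
import numpy as np
import pandas as pd

df = pd.DataFrame({"city": ["Oslo", "Pune", "Oslo"],
                   "temp": [3.1, np.nan, 5.2]})
df["humid"] = [0.71, 0.55, 0.64]          # size mutability: insert a column
print(df["temp"].mean())                   # missing data: NaN is handled, not an error

s = pd.Series([1.0, 2.0], index=["a", "b"])
t = pd.Series([10.0, 20.0], index=["b", "c"])
print(s + t)                               # automatic label alignment -> NaN for "a" and "c"

print(df.groupby("city")["temp"].mean())   # split-apply-combine aggregation

idx = pd.date_range("2024-01-01", periods=4, freq="D")
ts = pd.Series(range(4), index=idx)
print(ts.rolling(window=2).mean())         # moving-window time-series statistics
```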

---

## Key Quality Metrics

To ensure that the implementation is maintainable and testable, we will
track several quality metrics throughout the project lifecycle. The
metrics were selected based on guidance from software engineering
literature and industry best practices.

### Maintainability metrics

- **Maintainability index (MI):** Visual Studio defines an index from
  0 to 100 that summarises the ease of maintaining a piece of code.
  Higher values indicate more maintainable code, with scores of 20 and
  above considered “good,” 10–19 “moderate” and below 10 “poor.”
  MI combines several measurements such as cyclomatic complexity,
  depth of inheritance and class coupling. Although we do not
  compute MI directly, we monitor its constituent components to track
  trends over time.

- **Cyclomatic complexity:** This measures the number of linearly
  independent paths through a program. Each decision point (e.g.,
  `if`, `for`, `while`) adds one to the count. Higher complexity
  indicates more potential execution paths and requires more tests to
  achieve full coverage. Our metrics script approximates cyclomatic
  complexity by scanning for decision keywords, providing a
  reproducible indicator of structural complexity (see the worked
  sketch after this list).

- **Comment-to-code ratio:** The number of comment lines divided by
  the number of executable lines. Comments capture design
  assumptions, edge cases and rationale that are not obvious from
  code alone. A moderate ratio improves maintainability by aiding
  knowledge transfer and reducing ramp-up time for new contributors.
  However, excessively high ratios can reflect commented-out code or
  verbose documentation, so the ratio should be interpreted in
  context.

- **Average function length:** Smaller functions tend to perform a
single task, are easier to understand and thus easier to modify.
The metrics script measures the average number of code lines per
function. Keeping this metric low encourages modular design and
aligns with the Single Responsibility Principle.

- **Class coupling and depth of inheritance:** Although our project
  uses primarily functions and data structures, we will monitor
  coupling and inheritance depth where applicable. Visual Studio’s
  guidance notes that high class coupling and deep inheritance trees
  decrease maintainability. We will minimise dependencies between
  modules and favour composition over inheritance to keep these
  metrics low.
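
To make the keyword-scan approximation concrete, here is how the
script would score a small, purely illustrative function:

```python
# Illustrative only. The scan starts at a base complexity of 1 and adds
# 1 for each non-comment line containing a decision keyword.
def classify(value):
    if value is None:           # +1 ("if ")
        return "missing"
    for ch in str(value):       # +1 ("for ")
        if not ch.isdigit():    # +1 ("if ")
            return "text"
    return "number"

# base 1 + three decision lines -> approximate complexity of 4 for this span
```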

### Testability metrics

- **Test coverage:** Atlassian describes code coverage as a measure
  of how much of the code base is exercised by tests and notes
  several metrics: function, statement, branch, condition and line
  coverage. Although a high coverage percentage does not guarantee
  good tests, it reveals which parts of the system remain untested
  and helps to prioritise additional testing efforts. Since we cannot
  run external coverage tools in this environment, our metrics script
  approximates test effort by reporting the ratio of lines in test
  files to total lines of code and counting the number of test
  functions. Increasing the test-to-code ratio over time should
  correlate with improved coverage.

- **Number of test cases:** We treat each `test_*` function in
  Python and each call to `describe`/`it` in JavaScript as an
  individual test case (see the sketch after this list). Tracking the
  number of test cases encourages developers to write focused,
  granular tests and highlights subsystems that may need additional
  verification.

- **Complexity vs. tests:** Cyclomatic complexity tells us how many
  test cases are theoretically required to exercise all execution
  paths. By comparing the number of test cases to the aggregate
  complexity of the code base, we can judge whether testing is
  keeping pace with growing code intricacy. If complexity rises
  faster than test counts, there may be untested paths that warrant
  attention.
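
The counting rules can be checked directly against the script's own
logic; the snippet below is illustrative:

```python
import re

# JavaScript: the script counts describe/it call sites with this regex.
js = 'describe("frame", () => { it("aligns on labels", () => {}); });'
print(len(re.findall(r"\b(it|describe)\s*\(", js)))  # -> 2

# Python: a function in a test file is counted when its name starts
# with "test".
def test_mean_skips_nan():      # counted
    pass

def build_fixture_frame():      # not counted
    pass
```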

---

## Using the metrics

The `metrics_collector.py` script in `courseProjectCode/Metrics/`
implements the measurements described above. Running the script
generates a JSON report containing per-file metrics and a summary.
These metrics will form the basis of our quality dashboard and guide
refactoring and testing priorities throughout the project. By
monitoring comment ratios, function lengths, complexity and test
ratios, we can make data-driven decisions to keep the code base
maintainable and to ensure that behaviour is thoroughly validated.
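
As a usage sketch, the report can be captured and inspected
programmatically; the driver code below is illustrative (it assumes
the script is invoked from the repository root, and the summary keys
match those produced by `aggregate_metrics`):

```python
# Run the collector and load its JSON report.
import json
import subprocess

raw = subprocess.run(
    ["python", "courseProjectCode/Metrics/metrics_collector.py"],
    capture_output=True, text=True, check=True,
).stdout
report = json.loads(raw)
summary = report["summary"]
print(summary["total_files"], summary["comment_ratio"], summary["test_ratio"])
```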
Binary file added courseProjectDocs/Setup/Coverage report.pdf