
Commit 603f06f

Added project proposal and quality metrics
1 parent ac5d1d3 commit 603f06f

File tree

2 files changed: +364 -0 lines changed
Lines changed: 224 additions & 0 deletions
```python
# courseProjectCode/Metrics/metrics_collector.py
"""Collect simple maintainability and testability metrics for the repository.

Walks the source tree, analyses each source file, and prints a JSON report
with per-file metrics and an aggregated summary.
"""

import ast
import json
import os
import re
from typing import Dict, List, Tuple


ROOT_DIR = os.getcwd()

SKIP_DIRS = {
    "node_modules",
    "courseProjectDocs",
    "courseProjectCode",
    ".git",
    "__pycache__",
}

# Include .js as well: analyse_file() handles JavaScript, and without this
# entry count_js_functions() would never be reached.
SOURCE_EXTENSIONS = {".py", ".js"}


def count_python_functions(file_content: str) -> Tuple[int, List[Tuple[int, int]]]:
    """Return the number of function definitions and their (start, end) line spans."""
    try:
        tree = ast.parse(file_content)
    except SyntaxError:
        return 0, []

    function_spans = []
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            # end_lineno is available in Python 3.8+
            start_line = getattr(node, "lineno", None)
            end_line = getattr(node, "end_lineno", None)
            if start_line is not None and end_line is not None:
                function_spans.append((start_line, end_line))
    return len(function_spans), function_spans


def count_js_functions(file_content: str) -> Tuple[int, List[Tuple[int, int]]]:
    """Heuristic count: one function per non-comment line containing `function` or `=>`."""
    lines = file_content.splitlines()
    count = 0
    spans = []
    for idx, line in enumerate(lines, start=1):
        stripped = line.strip()
        if stripped.startswith("//") or stripped.startswith("/*"):
            continue
        if re.search(r"\bfunction\b", stripped) or re.search(r"=>", stripped):
            count += 1
            spans.append((idx, idx))
    return count, spans


def approximate_cyclomatic_complexity(lines: List[str]) -> int:
    """Approximate cyclomatic complexity by counting lines with decision keywords."""
    complexity = 1  # Base complexity
    decision_keywords = [
        "if ", "for ", "while ", "case ", "switch ", "catch ", "&&", "||", "?",
        "elif ", "except ",
    ]
    for line in lines:
        stripped = line.strip()
        if not stripped or stripped.startswith("#") or stripped.startswith("//"):
            continue
        for keyword in decision_keywords:
            if keyword in stripped:
                complexity += 1
                break
    return complexity


def analyse_file(filepath: str) -> Dict[str, object]:
    """Compute per-file metrics; returns an empty dict if the file cannot be read."""
    try:
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        return {}

    lines = content.splitlines()
    code_lines = 0
    comment_lines = 0
    in_block_comment = False

    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        if in_block_comment:
            comment_lines += 1
            if "*/" in stripped:
                in_block_comment = False
            continue
        if stripped.startswith("/*"):
            comment_lines += 1
            if "*/" not in stripped:
                in_block_comment = True
            continue
        if stripped.startswith("#") or stripped.startswith("//"):
            comment_lines += 1
            continue
        # Heuristic: only the opening line of a docstring is counted as a comment.
        if stripped.startswith('"""'):
            comment_lines += 1
            continue
        code_lines += 1

    ext = os.path.splitext(filepath)[1]
    functions_count = 0
    function_spans: List[Tuple[int, int]] = []
    if ext == ".py":
        functions_count, function_spans = count_python_functions(content)
    elif ext == ".js":
        functions_count, function_spans = count_js_functions(content)

    total_function_lines = 0
    for start, end in function_spans:
        if end >= start:
            total_function_lines += end - start + 1
    average_function_length = (
        (total_function_lines / functions_count) if functions_count > 0 else 0
    )

    complexity = approximate_cyclomatic_complexity(lines)

    # A file counts as a test file if any path component starts with "test".
    parts = filepath.lower().split(os.sep)
    is_test_file = any(
        part.startswith("test") for part in parts if part not in {"", "."}
    )

    test_functions_count = 0
    if is_test_file:
        if ext == ".py":
            try:
                tree = ast.parse(content)
            except SyntaxError:
                tree = None
            if tree is not None:
                for node in ast.walk(tree):
                    if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                        if node.name.startswith("test"):
                            test_functions_count += 1
        elif ext == ".js":
            test_functions_count = len(re.findall(r"\b(it|describe)\s*\(", content))

    return {
        "file": filepath,
        "lines_of_code": code_lines,
        "comment_lines": comment_lines,
        "comment_ratio": (comment_lines / code_lines) if code_lines > 0 else 0,
        "functions": functions_count,
        "average_function_length": average_function_length,
        "cyclomatic_complexity": complexity,
        "is_test_file": is_test_file,
        "test_functions": test_functions_count,
    }


def walk_repository(root_dir: str) -> List[Dict[str, object]]:
    """Analyse every source file under root_dir, skipping SKIP_DIRS."""
    results = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Remove skipped directories from traversal
        dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS]
        for filename in filenames:
            ext = os.path.splitext(filename)[1]
            if ext in SOURCE_EXTENSIONS:
                filepath = os.path.join(dirpath, filename)
                metrics = analyse_file(filepath)
                if metrics:
                    results.append(metrics)
    return results


def aggregate_metrics(results: List[Dict[str, object]]) -> Dict[str, object]:
    """Roll per-file metrics up into repository-wide totals and ratios."""
    total_code_lines = sum(item["lines_of_code"] for item in results)
    total_comment_lines = sum(item["comment_lines"] for item in results)
    total_functions = sum(item["functions"] for item in results)
    total_complexity = sum(item["cyclomatic_complexity"] for item in results)
    total_files = len(results)

    total_function_lines = sum(
        item["average_function_length"] * item["functions"] for item in results
    )
    average_function_length = (
        total_function_lines / total_functions if total_functions > 0 else 0
    )
    comment_ratio = (
        (total_comment_lines / total_code_lines) if total_code_lines > 0 else 0
    )

    test_files = [item for item in results if item["is_test_file"]]
    total_test_files = len(test_files)
    total_test_lines = sum(item["lines_of_code"] for item in test_files)
    total_test_functions = sum(item["test_functions"] for item in test_files)
    test_ratio = (
        (total_test_lines / total_code_lines) if total_code_lines > 0 else 0
    )

    aggregated = {
        "total_files": total_files,
        "total_code_lines": total_code_lines,
        "total_comment_lines": total_comment_lines,
        "comment_ratio": comment_ratio,
        "total_functions": total_functions,
        "average_function_length": average_function_length,
        "total_cyclomatic_complexity": total_complexity,
        "total_test_files": total_test_files,
        "total_test_lines": total_test_lines,
        "total_test_functions": total_test_functions,
        "test_ratio": test_ratio,
    }
    return aggregated


def main() -> None:
    results = walk_repository(ROOT_DIR)
    aggregated = aggregate_metrics(results)
    report = {
        "files": results,
        "summary": aggregated,
    }
    print(json.dumps(report, indent=2))


if __name__ == "__main__":
    main()
```
Lines changed: 140 additions & 0 deletions
# Project Proposal

## Project Overview

Our course project aims to build a lightweight data analysis library that
mimics essential features of the pandas ecosystem. The library will provide
tabular data structures (similar to DataFrame and Series) and support common
operations needed by scientists and engineers working with structured data.
Major functional goals include:
- **Handling missing data:** The system should represent missing values as
  NaN, NA, or NaT and propagate them through computations. This capability
  simplifies data cleaning and statistical analysis by preventing silent
  errors (software.com).

- **Size mutability:** Users should be able to insert or delete columns and
  rows in data structures. Dynamic resizing is central to interactive
  analysis workflows where the shape of a table evolves as new information
  becomes available (raw.githubusercontent.com).

- **Automatic and explicit data alignment:** When performing arithmetic or
  merging operations, the system will align data on labels or allow users
  to opt out of alignment entirely. Proper alignment prevents accidental
  mismatches and promotes reproducible results (raw.githubusercontent.com).

- **Flexible group-by operations:** The library should implement
  split–apply–combine patterns for aggregation, transformation, and
  filtering so that users can summarise data by categories with a single
  fluent expression (raw.githubusercontent.com); see the sketch after this
  list.

- **Robust I/O tooling:** Data structures must load from and save to common
  file formats (CSV, Excel) and efficiently persist to high-performance
  formats such as HDF5 (raw.githubusercontent.com).

- **Time-series functionality:** Operations like date-range generation,
  frequency conversion, moving-window statistics, and date shifting will be
  built in so that time-indexed data can be analysed without external
  libraries (raw.githubusercontent.com).
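To make the split–apply–combine goal concrete, here is a minimal plain-Python
sketch of the pattern the library will generalise (the function, field names,
and data are illustrative, not a committed API):

```python
from collections import defaultdict

def group_by_apply(rows, key, aggregate):
    """Split rows by key, apply aggregate to each group, combine into a dict."""
    groups = defaultdict(list)
    for row in rows:                  # split
        groups[row[key]].append(row)
    return {k: aggregate(v) for k, v in groups.items()}  # apply + combine

sales = [
    {"region": "north", "amount": 10},
    {"region": "south", "amount": 7},
    {"region": "north", "amount": 5},
]
totals = group_by_apply(sales, "region", lambda g: sum(r["amount"] for r in g))
print(totals)  # {'north': 15, 'south': 7}
```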
In addition to these functional requirements, the project emphasises
non-functional qualities such as performance, flexibility, and expressive
APIs. The goal is to provide an intuitive open-source tool that researchers
can use to analyse data without sacrificing speed or power
(raw.githubusercontent.com).

---

## Key Quality Metrics

To ensure that the implementation is maintainable and testable, we will
track several quality metrics throughout the project lifecycle. The metrics
were selected based on guidance from software engineering literature and
industry best practices.

### Maintainability metrics
- **Maintainability index (MI):** Visual Studio defines an index from 0 to
  100 that summarises the ease of maintaining a piece of code. Higher values
  indicate more maintainable code, with scores of 20 and above considered
  “good,” 10–19 “moderate,” and below 10 “poor” (learn.microsoft.com). MI
  combines several measurements such as cyclomatic complexity, depth of
  inheritance, and class coupling. Although we do not compute MI directly,
  we monitor its constituent components to track trends over time (see the
  formula after this list).

- **Cyclomatic complexity:** This measures the number of linearly
  independent paths through a program. Each decision point (e.g., if, for,
  while) adds one to the count. Higher complexity indicates more potential
  execution paths and requires more tests to achieve full coverage
  (learn.microsoft.com). Our metrics script approximates cyclomatic
  complexity by scanning for decision keywords, providing a reproducible
  indicator of structural complexity (a worked example follows this list).

- **Comment-to-code ratio:** The number of comment lines divided by the
  number of executable lines (software.com). Comments capture design
  assumptions, edge cases, and rationale that are not obvious from code
  alone. A moderate ratio improves maintainability by aiding knowledge
  transfer and reducing ramp-up time for new contributors (software.com).
  However, excessively high ratios can reflect commented-out code or
  verbose documentation, so the ratio should be interpreted in context
  (software.com).

- **Average function length:** Smaller functions tend to perform a single
  task and are easier to understand, and thus easier to modify. The metrics
  script measures the average number of code lines per function. Keeping
  this metric low encourages modular design and aligns with the Single
  Responsibility Principle.

- **Class coupling and depth of inheritance:** Although our project uses
  primarily functions and data structures, we will monitor coupling and
  inheritance depth where applicable. Visual Studio’s guidance notes that
  high class coupling and deep inheritance trees decrease maintainability
  (learn.microsoft.com). We will minimise dependencies between modules and
  favour composition over inheritance to keep these metrics low.
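For reference, the rescaled MI formula documented by Microsoft (stated here
from its documentation; our script does not compute it) is:

    MI = MAX(0, (171 − 5.2 × ln(Halstead Volume)
                     − 0.23 × (Cyclomatic Complexity)
                     − 16.2 × ln(Lines of Code)) × 100 / 171)

And to make the complexity heuristic concrete, here is a small worked example
that calls the script's `approximate_cyclomatic_complexity` (this assumes the
script is importable as `metrics_collector`):

```python
from metrics_collector import approximate_cyclomatic_complexity

snippet = '''
def classify(x):
    if x < 0:
        return "negative"
    for ch in str(x):
        if ch == "0":
            return "has zero"
    return "positive"
'''.splitlines()

# 1 (base) + 3 lines containing decision keywords ("if ", "for ", "if ") = 4
print(approximate_cyclomatic_complexity(snippet))  # 4
```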
### Testability metrics

- **Test coverage:** Atlassian describes code coverage as a measure of how
  much of the code base is exercised by tests and notes several metrics:
  function, statement, branch, condition, and line coverage (atlassian.com).
  Although a high coverage percentage does not guarantee good tests, it
  reveals which parts of the system remain untested and helps to prioritise
  additional testing efforts. Since we cannot run external coverage tools
  in this environment, our metrics script approximates test effort by
  reporting the ratio of lines in test files to total lines of code and
  counting the number of test functions. Increasing the test-to-code ratio
  over time should correlate with improved coverage.

- **Number of test cases:** We treat each `test_*` function in Python and
  each call to `describe`/`it` in JavaScript as an individual test case.
  Tracking the number of test cases encourages developers to write focused,
  granular tests and highlights subsystems that may need additional
  verification.

- **Complexity vs. tests:** Cyclomatic complexity tells us how many test
  cases are theoretically required to exercise all execution paths
  (learn.microsoft.com). By comparing the number of test cases to the
  aggregate complexity of the code base, we can judge whether testing is
  keeping pace with growing code intricacy. If complexity rises faster than
  test counts, there may be untested paths that warrant attention (see the
  snippet after this list).
130+
131+
## Using the metrics
132+
133+
The `metrics_collector.py` script in `courseProjectCode/Metrics/`
134+
implements the measurements described above. Running the script
135+
generates a JSON report containing per-file metrics and a summary.
136+
These metrics will form the basis of our quality dashboard and guide
137+
refactoring and testing priorities throughout the project. By
138+
monitoring comment ratios, function lengths, complexity and test
139+
ratios, we can make data-driven decisions to keep the code base
140+
maintainable and to ensure that behaviour is thoroughly validated.
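For example, the report can be generated and saved from the repository root
like this (the output path is arbitrary; the summary keys are the ones the
script emits, with illustrative values):

```bash
python courseProjectCode/Metrics/metrics_collector.py > metrics_report.json
```

```
{
  "files": [ ... ],
  "summary": {
    "total_files": 12,
    "total_code_lines": 1480,
    "comment_ratio": 0.21,
    "total_functions": 96,
    "average_function_length": 11.4,
    "total_cyclomatic_complexity": 240,
    "total_test_files": 4,
    "total_test_functions": 35,
    "test_ratio": 0.18
  }
}
```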

0 commit comments
