Skip to content

Commit 8d0e5ef

Browse files
authored
Detached Partitioner from CVMatrix and updated documentation. (#9)
Detached the Partitioner from CVMatrix so that the dictionary of integer arrays with validation indices does not need to be pickled when using multiprocessing together with CVMatrix such as in https://github.com/sm00thix/ikpls
1 parent 5858392 commit 8d0e5ef

File tree

10 files changed

+680
-436
lines changed

10 files changed

+680
-436
lines changed

README.md

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ The `cvmatrix` software package now also features **weighted matrix products** $\
4040
> ```python
4141
> import numpy as np
4242
> from cvmatrix.cvmatrix import CVMatrix
43+
> from cvmatrix.partitioner import Partitioner
4344
>
4445
> N = 100 # Number of samples.
4546
> K = 50 # Number of features.
@@ -55,30 +56,34 @@ The `cvmatrix` software package now also features **weighted matrix products** $\
5556
>
5657
> # Instantiate CVMatrix
5758
> cvm = CVMatrix(
58-
> folds=folds,
5959
> center_X=True, # Center around the weighted mean of X.
6060
> center_Y=True, # Center around the weighted mean of Y.
6161
> scale_X=True, # Scale by the weighted standard deviation of X.
6262
> scale_Y=True, # Scale by the weighted standard deviation of Y.
6363
> )
64-
> # Fit on X and Y
64+
> # Fit on X, Y, and weights
6565
> cvm.fit(X=X, Y=Y, weights=weights)
66+
>
67+
> # Instantiate Partitioner
68+
> p = Partitioner(folds=folds)
69+
>
6670
> # Compute training set XTWX and/or XTWY for each fold
67-
> for fold in cvm.folds_dict:
71+
> for fold in p.folds_dict:
72+
> val_indices = p.get_validation_indices(fold)
6873
> # Get both XTWX, XTWY, and weighted statistics
69-
> result = cvm.training_XTX_XTY(fold)
74+
> result = cvm.training_XTX_XTY(val_indices)
7075
> (training_XTWX, training_XTWY) = result[0]
7176
> (training_X_mean, training_X_std, training_Y_mean, training_Y_std) = result[1]
7277
>
7378
> # Get only XTWX and weighted statistics for X.
7479
> # Weighted statistics for Y are returned as None as they are not computed when
7580
> # only XTWX is requested.
76-
> result = cvm.training_XTX(fold)
81+
> result = cvm.training_XTX(val_indices)
7782
> training_XTWX = result[0]
7883
> (training_X_mean, training_X_std, training_Y_mean, training_Y_std) = result[1]
7984
>
8085
> # Get only XTWY and weighted statistics
81-
> result = cvm.training_XTY(fold)
86+
> result = cvm.training_XTY(val_indices)
8287
> training_XTWY = result[0]
8388
> (training_X_mean, training_X_std, training_Y_mean, training_Y_std) = result[1]
8489

benchmarks/benchmark.py

Lines changed: 35 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
compared for different values of P, center_X, center_Y, scale_X, and scale_Y. The
55
results are saved to a CSV file for further analysis.
66
7-
Engstrøm, O.-C. G. (2024):
8-
https://arxiv.org/abs/2401.13185
7+
O.-C. G. Engstrøm and M. H. Jensen (2025):
8+
https://analyticalsciencejournals.onlinelibrary.wiley.com/doi/full/10.1002/cem.70008
99
1010
Author: Ole-Christian Galbo Engstrøm
11-
E-mail: ole.e@di.ku.dk
11+
E-mail: ocge@foss.dk
1212
"""
1313

1414
import os
@@ -28,20 +28,24 @@
2828

2929
from cvmatrix.__init__ import __version__
3030
from cvmatrix.cvmatrix import CVMatrix
31+
from cvmatrix.partitioner import Partitioner
3132
from tests.naive_cvmatrix import NaiveCVMatrix
3233

3334

3435
def save_result_to_csv(
35-
model, P, N, K, M, center_X, center_Y, scale_X, scale_Y, time, version
36+
model, use_weights, P, N, K, M, center_X, center_Y, scale_X, scale_Y, time, version
3637
):
3738
try:
3839
with open("benchmark_results.csv", "x") as f:
39-
f.write("model,P,N,K,M," "center_X,center_Y,scale_X,scale_Y,time,version\n")
40+
f.write(
41+
"model,weights,P,N,K,M,"
42+
"center_X,center_Y,scale_X,scale_Y,time,version\n"
43+
)
4044
except FileExistsError:
4145
pass
4246
with open("benchmark_results.csv", "a") as f:
4347
f.write(
44-
f"{model},{P},{N},{K},{M},"
48+
f"{model},{use_weights},{P},{N},{K},{M},"
4549
f"{center_X},{center_Y},{scale_X},{scale_Y},"
4650
f"{time},{version}\n"
4751
)
@@ -96,7 +100,6 @@ def execute_algorithm(
96100

97101
# Create the model
98102
model = model_class(
99-
folds=cv_splits,
100103
center_X=center_X,
101104
center_Y=center_Y,
102105
scale_X=scale_X,
@@ -105,12 +108,26 @@ def execute_algorithm(
105108
copy=True,
106109
)
107110

111+
# Create the validation partitioner
112+
p = Partitioner(folds=cv_splits)
113+
108114
# Fit the model
109115
model.fit(X, Y, weights)
110116

111-
# Compute the training set matrices
112-
for fold in model.folds_dict.keys():
113-
model.training_XTX_XTY(fold)
117+
if isinstance(model, NaiveCVMatrix):
118+
# Compute the training set matrices
119+
for fold in p.folds_dict:
120+
# Get the training indices for the current fold
121+
training_indices = np.concatenate(
122+
[p.get_validation_indices(f) for f in p.folds_dict if f != fold]
123+
)
124+
model.training_XTX_XTY(training_indices)
125+
else:
126+
# Compute the training set matrices
127+
for fold in p.folds_dict:
128+
# Get the validation indices for the current fold
129+
validation_indices = p.get_validation_indices(fold)
130+
model.training_XTX_XTY(validation_indices)
114131

115132

116133
if __name__ == "__main__":
@@ -122,19 +139,20 @@ def execute_algorithm(
122139
dtype = np.float64 # Data type
123140
X = rng.random((N, K), dtype=dtype) # Random X matrix
124141
Y = rng.random((N, M), dtype=dtype) # Random Y matrix
125-
# weights = rng.random((N,), dtype=dtype) # Random weights
126-
weights = None
142+
weights = rng.random((N,), dtype=dtype) # Random weights
127143
cv_splits = np.arange(N) # We can use mod P for P-fold cross-validation
144+
use_weights = [True, False] # Whether to use weights or not
128145
center_Xs = [True, False]
129146
center_Ys = [True, False]
130147
scale_Xs = [True, False]
131148
scale_Ys = [True, False]
132149
Ps = [3, 5, 10, 100, 1000, 10000, 100000]
133150

134-
for center_X, center_Y, scale_X, scale_Y, P in product(
135-
center_Xs, center_Ys, scale_Xs, scale_Ys, Ps
151+
for use_w, center_X, center_Y, scale_X, scale_Y, P in product(
152+
use_weights, center_Xs, center_Ys, scale_Xs, scale_Ys, Ps
136153
):
137154
print(
155+
f"weights={use_w}, "
138156
f"P={P}, "
139157
f"center_X={center_X}, center_Y={center_Y}, "
140158
f"scale_X={scale_X}, scale_Y={scale_Y}, "
@@ -149,13 +167,14 @@ def execute_algorithm(
149167
scale_Y=scale_Y,
150168
X=X,
151169
Y=Y,
152-
weights=weights,
170+
weights=weights if use_w else None,
153171
),
154172
number=1,
155173
)
156174
print(f"CVMatrix, Time: {time:.2f} seconds")
157175
save_result_to_csv(
158176
"CVMatrix",
177+
use_w,
159178
P,
160179
N,
161180
K,
@@ -191,6 +210,7 @@ def execute_algorithm(
191210
print()
192211
save_result_to_csv(
193212
"NaiveCVMatrix",
213+
use_w,
194214
P,
195215
N,
196216
K,

cvmatrix/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "2.1.1"
1+
__version__ = "3.0.0"

0 commit comments

Comments
 (0)