Exposed get_validation_indices and added an LOOCV test case (#8)

sm00thix · web-flow · commit 585839272a30 · 2025-07-08T14:06:52.000+02:00
Co-authored-by: Ole-Christian Galbo Engstrøm <ocge@foss.dk>
diff --git a/cvmatrix/__init__.py b/cvmatrix/__init__.py
@@ -1 +1 @@
-__version__ = "2.1.0.post1"
+__version__ = "2.1.1"
diff --git a/cvmatrix/cvmatrix.py b/cvmatrix/cvmatrix.py
@@ -27,7 +27,8 @@ class CVMatrix:
     ----------
     folds : Iterable of Hashable with N elements
         An iterable defining cross-validation splits. Each unique value in
-        `folds` corresponds to a different fold.
+        `folds` corresponds to a different fold. The validation indices for each fold
+        can be accessed using the `get_validation_indices` method.
 
     center_X : bool, optional, default=True
         Whether to center `X` before computation of
@@ -348,7 +349,7 @@ def training_statistics(self, fold: Hashable) -> Tuple[
             If `fold` was not provided as a cross-validation split in the
             `folds` parameter of the constructor.
         """
-        val_indices = self._get_val_indices(fold)
+        val_indices = self.get_validation_indices(fold)
         X_val, X_val_unweighted, Y_val, Y_val_unweighted = self._get_val_matrices(
             val_indices=val_indices, return_XTY=self.Y_total is not None
         )
@@ -366,6 +367,32 @@ def training_statistics(self, fold: Hashable) -> Tuple[
             :-1
         ]  # Exclude the sum of training weights from the return tuple
 
+    def get_validation_indices(self, fold: Hashable) -> npt.NDArray[np.int_]:
+        """
+        Returns the indices of the validation set samples for a given fold.
+
+        Parameters
+        ----------
+        fold : Hashable
+            The fold for which to return the validation set indices.
+
+        Returns
+        -------
+        Array of shape (N_val,)
+            The indices of the validation set samples for the given fold.
+
+        Raises
+        ------
+        ValueError
+            If `fold` was not provided as a cross-validation split in the
+            `folds` parameter of the constructor.
+        """
+        try:
+            val_indices = self.folds_dict[fold]
+        except KeyError as e:
+            raise ValueError(f"Fold {fold} not found.") from e
+        return val_indices
+
     def _get_sum_w_train_and_num_nonzero_w_train(
         self, val_indices: npt.NDArray[np.int_]
     ) -> Tuple[float, float]:
@@ -589,7 +616,7 @@ def _training_matrices(
             )
         if return_XTY and self.Y_total is None:
             raise ValueError("Response variables `Y` are not provided.")
-        val_indices = self._get_val_indices(fold)
+        val_indices = self.get_validation_indices(fold)
         X_val, X_val_unweighted, Y_val, Y_val_unweighted = self._get_val_matrices(
             val_indices=val_indices, return_XTY=return_XTY
         )
@@ -680,24 +707,6 @@ def _training_matrices(
             stats_tuple,
         )
 
-    def _get_val_indices(self, fold: Hashable) -> npt.NDArray[np.int_]:
-        """
-        Returns the indices of the validation set samples for a given fold.
-        Parameters
-        ----------
-        fold : Hashable
-            The fold for which to return the validation set indices.
-        Returns
-        -------
-        Array of shape (N_val,)
-            The indices of the validation set samples for the given fold.
-        """
-        try:
-            val_indices = self.folds_dict[fold]
-        except KeyError as e:
-            raise ValueError(f"Fold {fold} not found.") from e
-        return val_indices
-
     def _get_val_matrices(
         self, val_indices: npt.NDArray[np.int_], return_XTY: bool
     ) -> Tuple[
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cvmatrix"
-version = "2.1.0.post1"
+version = "2.1.1"
 description = "Fast computation of possibly weighted and possibly centered/scaled training set kernel matrices in a cross-validation setting."
 authors = ["Sm00thix <oleemail@icloud.com>"]
 maintainers = ["Sm00thix <oleemail@icloud.com>"]
diff --git a/tests/test_cvmatrix.py b/tests/test_cvmatrix.py
@@ -1139,3 +1139,46 @@ def test_statistics_cvmatrix_methods(self):
                     err_msg="Statistics from training_statistics and "
                     "training_XTX methods are not equivalent." + diagnostic_msg,
                 )
+
+    def test_loocv(self):
+        """
+        Tests if the matrices computed by the NaiveCVMatrix and CVMatrix models are
+        equivalent when using Leave-One-Out Cross-Validation (LOOCV).
+        """
+        X = self.load_X()[:, :5]
+        Ys = [None, self.load_Y(["Protein", "Moisture"])]
+        folds = np.arange(X.shape[0])
+        center_Xs = [False, True]
+        center_Ys = [False, True]
+        scale_Xs = [False, True]
+        scale_Ys = [False, True]
+        ddofs = [0, 1]
+        use_weights = [False, True]
+        for center_X, center_Y, scale_X, scale_Y, use_w, ddof, Y in product(
+            center_Xs, center_Ys, scale_Xs, scale_Ys, use_weights, ddofs, Ys
+        ):
+            diagnostic_msg = (
+                f"center_X: {center_X}, center_Y: {center_Y}, "
+                f"scale_X: {scale_X}, scale_Y: {scale_Y}, "
+                f"ddof: {ddof}, use_weights: {use_w}, use_Y: {Y is not None}"
+            )
+            if use_w:
+                weights = self.randomly_zero_weights(self.load_weights(random=True))
+            else:
+                weights = None
+            naive, fast = self.fit_models(
+                X,
+                Y,
+                weights,
+                folds,
+                center_X,
+                center_Y,
+                scale_X,
+                scale_Y,
+                ddof,
+                np.float64,
+            )
+            print(diagnostic_msg)
+            # Extract 20 unique folds from the folds array.
+            subset_folds = np.random.choice(np.unique(folds), size=20, replace=False)
+            self.check_equivalent_matrices(naive, fast, subset_folds)

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "2.1.0.post1"`
	`1`	`+__version__ = "2.1.1"`