Skip to content

Commit 5d0902a

Browse files
committed
Refactor GCV implementation
1 parent c98e7f1 commit 5d0902a

File tree

1 file changed

+61
-35
lines changed

1 file changed

+61
-35
lines changed

csaps/_sspumv.py

Lines changed: 61 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,6 @@ def breaks(self) -> np.ndarray:
4040
@property
4141
def coeffs(self) -> np.ndarray:
4242
return self.c
43-
44-
@property
45-
def gcv(self) -> float:
46-
return self.gcv
4743

4844
@property
4945
def order(self) -> int:
@@ -53,6 +49,10 @@ def order(self) -> int:
5349
def pieces(self) -> int:
5450
return self.c.shape[1]
5551

52+
@property
53+
def gcv(self) -> float:
54+
return self.gcv
55+
5656
@property
5757
def ndim(self) -> int:
5858
"""Returns the number of spline dimensions
@@ -83,6 +83,7 @@ def __repr__(self): # pragma: no cover
8383
f' pieces: {self.pieces}\n'
8484
f' order: {self.order}\n'
8585
f' ndim: {self.ndim}\n'
86+
f' degrees of freedom: {self.degrees_of_freedom}\n'
8687
f' gcv: {self.gcv}\n'
8788

8889
)
@@ -126,18 +127,20 @@ class CubicSmoothingSpline(ISmoothingSpline[
126127
and less sensitive to nonuniformity of weights and xdata clumping
127128
128129
.. versionadded:: 1.1.0
129-
130-
calculate_gcv : [*Optional*] bool
131-
If True, the Generalized Cross Validation criterion value will be calculated for the spline upon creation.
132-
The GCV can be minimized to attempt to identify an optimal smoothing parameter, while penalizing over fitting.
133-
Strictly speaking this involves generating splines for all smoothing parameter values across the valid domain
134-
of the smoothing parameter [0,1] then selecting the smoothing parameter that produces the lowest GCV.
135-
[See GCV in TEOSL (page 244 section 7.52)](https://hastie.su.domains/ElemStatLearn/printings/ESLII_print12_toc.pdf) for more information on methodology.
136-
The simplest way to use the GCV is to setup a loop that generates a spline with your data at every step, a step size
137-
of 0.001 is generally sufficient, and keep track of the spline that produces the lowest GCV. Setting this parameter to True,
138-
does _not_ generate the lowest GCV, it simply generates the value for the spline with this specific smoothing parameter,
139-
so that you may decide how to optimize it for yourself.
140-
130+
131+
calculate_degrees_of_freedom : [*Optional*] bool
132+
If True, the degrees of freedom for the spline will be calculated and set as a property on the returned
133+
spline object. Typically the degrees of freedom may be used to calculate the Generalized Cross Validation
134+
criterion. The GCV can be minimized to attempt to identify an optimal smoothing parameter, while penalizing
135+
over fitting.
136+
Strictly speaking this involves generating splines for all smoothing parameter values across the valid domain
137+
of the smoothing parameter [0,1] then selecting the smoothing parameter that produces the lowest GCV. See
138+
[GCV in TEOSL (page 244 section 7.52)](https://hastie.su.domains/ElemStatLearn/printings/ESLII_print12_toc.pdf)
139+
for more information on methodology.
140+
The simplest way to use the GCV is to setup a loop that generates a spline with your data at every step, a step
141+
size of 0.001 is generally sufficient, and keep track of the spline that produces the lowest GCV. Setting this
142+
parameter to True, does _not_ generate the lowest GCV, it simply sets the neccesary dependencies to use the
143+
calculate_gcv function. You must still compute the GCV for each smoothing parameter.
141144
142145
"""
143146

@@ -150,17 +153,13 @@ def __init__(self,
150153
smooth: Optional[float] = None,
151154
axis: int = -1,
152155
normalizedsmooth: bool = False,
153-
calculate_gcv: bool = False):
156+
calculate_degrees_of_freedom: bool = False):
154157

155158
x, y, w, shape, axis = self._prepare_data(xdata, ydata, weights, axis)
156-
if calculate_gcv:
157-
coeffs, smooth, gcv = self._make_spline(x, y, w, smooth, shape, normalizedsmooth, calculate_gcv)
158-
self.gcv = gcv
159-
else:
160-
coeffs, smooth = self._make_spline(x, y, w, smooth, shape, normalizedsmooth, calculate_gcv)
161-
self.gcv = None
159+
coeffs, smooth, degrees_of_freedom = self._make_spline(x, y, w, smooth, shape, normalizedsmooth, calculate_degrees_of_freedom)
162160
spline = SplinePPForm.construct_fast(coeffs, x, axis=axis)
163-
161+
self.degrees_of_freedom = degrees_of_freedom
162+
self.gcv = None
164163
self._smooth = smooth
165164
self._spline = spline
166165

@@ -220,6 +219,33 @@ def spline(self) -> SplinePPForm:
220219
"""
221220
return self._spline
222221

222+
def compute_gcv(self, y, y_pred) -> float:
223+
"""Computes the GCV using the degrees of freedom, which must be computed upon spline creation
224+
225+
Parameters
226+
----------
227+
228+
y : 1-d array-like
229+
Points the spline was evaluated at
230+
231+
y_pred : 1-d array-like
232+
Predicted values returned from the generated spline object
233+
234+
Returns
235+
-------
236+
gcv : float
237+
The generalized cross validation criterion
238+
239+
240+
"""
241+
if not self.degrees_of_freedom:
242+
raise ValueError("You must recreate the spline with the calculate_degrees_of_freedom parameter set to True")
243+
244+
y = np.asarray(y, dtype=np.float64)
245+
y_pred = np.asarray(y_pred, dtype=np.float64)
246+
self.gcv: float = np.linalg.norm(y-y_pred)**2 / (y.size - self.degrees_of_freedom)**2
247+
return self.gcv
248+
223249
@staticmethod
224250
def _prepare_data(xdata, ydata, weights, axis):
225251
xdata = np.asarray(xdata, dtype=np.float64)
@@ -285,7 +311,7 @@ def _normalize_smooth(x: np.ndarray, w: np.ndarray, smooth: Optional[float]):
285311
return p
286312

287313
@staticmethod
288-
def _make_spline(x, y, w, smooth, shape, normalizedsmooth, calculate_gcv):
314+
def _make_spline(x, y, w, smooth, shape, normalizedsmooth, calculate_degrees_of_freedom):
289315
pcount = x.size
290316
dx = np.diff(x)
291317

@@ -314,10 +340,10 @@ def _make_spline(x, y, w, smooth, shape, normalizedsmooth, calculate_gcv):
314340
dx_recip = 1. / dx
315341
diags_qtw = np.vstack((dx_recip[:-1], -(dx_recip[1:] + dx_recip[:-1]), dx_recip[1:]))
316342
diags_sqrw_recip = 1. / np.sqrt(w)
317-
343+
318344
# Calculate and store qt separately, so we can use it later to calculate the degrees of freedom for the gcv
319345
qt = sp.diags(diags_qtw, [0, 1, 2], (pcount - 2, pcount))
320-
qtw = ( qt @ sp.diags(diags_sqrw_recip, 0, (pcount, pcount)))
346+
qtw = (qt @ sp.diags(diags_sqrw_recip, 0, (pcount, pcount)))
321347
qtw = qtw @ qtw.T
322348

323349
p = smooth
@@ -359,11 +385,11 @@ def _make_spline(x, y, w, smooth, shape, normalizedsmooth, calculate_gcv):
359385

360386
c_shape = (4, pcount - 1) + shape[1:]
361387
c = np.vstack((c1, c2, c3, c4)).reshape(c_shape)
362-
363-
# As calculating the GCV is a relatively expensive operation, it's best to add the calculation inline and make it optional
364-
if calculate_gcv:
365-
degrees_of_freedom = p * (la.inv(p * W + ((1-p) * 6 * qt.T @ la.inv(r) @ qt)) @ W).trace()
366-
gcv = np.linalg.norm(y-y_pred)**2 / (y.size - degrees_of_freedom)**2
367-
return c, p, gcv
368-
369-
return c, p
388+
389+
# As calculating the GCV is a relatively expensive operation, we store the degrees of freedom
390+
# required for the GCV calculation
391+
if calculate_degrees_of_freedom:
392+
degrees_of_freedom = p * (la.inv(p * w + ((1-p) * 6 * qt.T @ la.inv(r) @ qt)) @ w).trace()
393+
return c, p, degrees_of_freedom
394+
else:
395+
return c, p, None

0 commit comments

Comments
 (0)