single-multivariate.c
1 //! @file single-multivariate.c
2 //! @author J. Marcel van der Veer
3 //!
4 //! @section Copyright
5 //!
6 //! This file is part of Algol68G - an Algol 68 compiler-interpreter.
7 //! Copyright 2001-2023 J. Marcel van der Veer [algol68g@xs4all.nl].
8 //!
9 //! @section License
10 //!
11 //! This program is free software; you can redistribute it and/or modify it
12 //! under the terms of the GNU General Public License as published by the
13 //! Free Software Foundation; either version 3 of the License, or
14 //! (at your option) any later version.
15 //!
16 //! This program is distributed in the hope that it will be useful, but
17 //! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18 //! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
19 //! more details. You should have received a copy of the GNU General Public
20 //! License along with this program. If not, see [http://www.gnu.org/licenses/].
21
22 //! @section Synopsis
23 //!
24 //! REAL multivariate regression.
25
26 #include "a68g.h"
27
28 #if defined (HAVE_GSL)
29
30 #include "a68g-torrix.h"
31 #include "a68g-prelude-gsl.h"
32
33 //! @brief Compute pseudo_inverse := pseudo inverse (X)
34
35 void compute_pseudo_inverse (NODE_T *p, gsl_matrix **pseudo_inverse, gsl_matrix *X, REAL_T lim)
36 {
37 //
38 // The pseudo inverse gives a least-square approximate solution for a
39 // system of linear equations not having an exact solution.
40 // Multivariate statistics is a well known application.
41 //
42 MATH_RTE (p, X == NO_REAL_MATRIX, M_ROW_ROW_REAL, "empty data matrix");
43 MATH_RTE (p, lim < 0, M_REAL, "invalid limit");
44 unt M = SIZE1 (X), N = SIZE2 (X);
45 // GSL only handles M >= N, transpose commutes with pseudo inverse.
46 gsl_matrix *U;
47 BOOL_T transpose = (M < N);
48 if (transpose) {
49 M = SIZE2 (X); N = SIZE1 (X);
50 U = gsl_matrix_calloc (M, N);
51 gsl_matrix_transpose_memcpy (U, X);
52 } else {
53 U = gsl_matrix_calloc (M, N);
54 gsl_matrix_memcpy (U, X);
55 }
56 // A = USVᵀ by Jacobi, more precise than Golub-Reinsch.
57 // The GSL routine yields V, not Vᵀ. U is decomposed in place.
58 gsl_matrix *V = gsl_matrix_calloc (N, N);
59 gsl_vector *Sv = gsl_vector_calloc (N);
60 ASSERT_GSL (gsl_linalg_SV_decomp_jacobi (U, V, Sv));
61 // Compute S⁻¹.
62 // Diagonal elements < 'lim' * max (singular values) are set to zero.
63 // Very small eigenvalues wreak havoc on a pseudo inverse.
64 // Python NumPy default is 1e-15, but assumes a Hermitian matrix.
65 // SVD gives singular values sorted in descending order.
66 REAL_T lwb = gsl_vector_get (Sv, 0) * (lim > 0 ? lim : 1e-9);
67 gsl_matrix *S_inv = gsl_matrix_calloc (N, M); // calloc sets matrix to 0.
68 for (unt i = 0; i < N; i++) {
69 REAL_T x = gsl_vector_get (Sv, i);
70 if (x > lwb) {
71 gsl_matrix_set (S_inv, i, i, 1 / x);
72 } else {
73 gsl_matrix_set (S_inv, i, i, 0);
74 }
75 }
76 gsl_vector_free (Sv);
77 // GSL SVD yields thin SVD - pad with zeros.
78 gsl_matrix *Uf = gsl_matrix_calloc (M, M);
79 for (unt i = 0; i < M; i++) {
80 for (unt j = 0; j < N; j++) {
81 gsl_matrix_set (Uf, i, j, gsl_matrix_get (U, i, j));
82 }
83 }
84 // Compute pseudo inverse A⁻¹ = VS⁻¹Uᵀ.
85 gsl_matrix *VS_inv = gsl_matrix_calloc (N, M);
86 gsl_matrix *X_inv = gsl_matrix_calloc (N, M);
87 ASSERT_GSL (gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1, V, S_inv, 0, VS_inv));
88 ASSERT_GSL (gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1, VS_inv, Uf, 0, X_inv));
89 // Compose result.
90 if (transpose) {
91 (*pseudo_inverse) = gsl_matrix_calloc (M, N);
92 gsl_matrix_transpose_memcpy ((*pseudo_inverse), X_inv);
93 } else {
94 (*pseudo_inverse) = gsl_matrix_calloc (N, M);
95 gsl_matrix_memcpy ((*pseudo_inverse), X_inv);
96 }
97 // Clean up.
98 gsl_matrix_free (S_inv);
99 gsl_matrix_free (U);
100 gsl_matrix_free (Uf);
101 gsl_matrix_free (V);
102 gsl_matrix_free (VS_inv);
103 gsl_matrix_free (X_inv);
104 }
105
106 //! @brief PROC pseudo inv = ([, ] REAL, REAL) [, ] REAL
107
108 void genie_matrix_pinv_lim (NODE_T * p)
109 {
110 // Compute the Moore-Penrose pseudo inverse of a MxN matrix.
111 // G. Strang. "Linear Algebra and its applications", 2nd edition.
112 // Academic Press [1980].
113 gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
114 torrix_error_node = p;
115 A68_REAL lim;
116 POP_OBJECT (p, &lim, A68_REAL);
117 gsl_matrix *X = pop_matrix (p, A68_TRUE), *pseudo_inverse = NO_REAL_MATRIX;
118 compute_pseudo_inverse (p, &pseudo_inverse, X, VALUE (&lim));
119 push_matrix (p, pseudo_inverse);
120 gsl_matrix_free (pseudo_inverse);
121 gsl_matrix_free (X);
122 (void) gsl_set_error_handler (save_handler);
123 }
124
125 //! @brief OP PINV = ([, ] REAL) [, ] REAL
126
127 void genie_matrix_pinv (NODE_T * p)
128 {
129 PUSH_VALUE (p, 0, A68_REAL);
130 genie_matrix_pinv_lim (p);
131 }
132
133 //! @brief column-centered data matrix.
134
135 gsl_matrix *column_mean (gsl_matrix *DATA)
136 {
137 // M samples, N features.
138 unt M = SIZE1 (DATA), N = SIZE2 (DATA);
139 gsl_matrix *C = gsl_matrix_calloc (M, N);
140 for (unt i = 0; i < N; i++) {
141 DOUBLE_T sum = 0;
142 for (unt j = 0; j < M; j++) {
143 sum += gsl_matrix_get (DATA, j, i);
144 }
145 REAL_T mean = sum / M;
146 for (unt j = 0; j < M; j++) {
147 gsl_matrix_set (C, j, i, gsl_matrix_get (DATA, j, i) - mean);
148 }
149 }
150 return C;
151 }
152
153 //! @brief take left columns from matrix.
154
155 void genie_left_columns (NODE_T *p)
156 {
157 A68_INT k;
158 POP_OBJECT (p, &k, A68_INT);
159 gsl_matrix *X = pop_matrix (p, A68_TRUE);
160 unt M = SIZE1 (X), N = SIZE2 (X), cols = VALUE (&k);
161 MATH_RTE (p, cols < 0, M_INT, "invalid number of columns");
162 if (cols == 0 || cols > N) {
163 cols = N;
164 }
165 gsl_matrix *Y = gsl_matrix_calloc (M, cols);
166 for (unt i = 0; i < M; i++) {
167 for (unt j = 0; j < cols; j++) {
168 gsl_matrix_set (Y, i, j, gsl_matrix_get (X, i, j));
169 }
170 }
171 push_matrix (p, Y);
172 gsl_matrix_free (X);
173 gsl_matrix_free (Y);
174 }
175
176 //! @brief OP MEAN = ([, ] REAL) [, ] REAL
177
178 void genie_matrix_column_mean (NODE_T *p)
179 {
180 gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
181 torrix_error_node = p;
182 gsl_matrix *Z = pop_matrix (p, A68_TRUE);
183 unt M = SIZE1 (Z), N = SIZE2 (Z);
184 gsl_matrix *A = gsl_matrix_calloc (M, N);
185 for (unt i = 0; i < N; i++) {
186 DOUBLE_T sum = 0;
187 for (unt j = 0; j < M; j++) {
188 sum += gsl_matrix_get (Z, j, i);
189 }
190 DOUBLE_T mean = sum / M;
191 for (unt j = 0; j < M; j++) {
192 gsl_matrix_set (A, j, i, mean);
193 }
194 }
195 push_matrix (p, A);
196 gsl_matrix_free (A);
197 gsl_matrix_free (Z);
198 (void) gsl_set_error_handler (save_handler);
199 }
200
201 //! @brief compute PCA of MxN matrix from covariance matrix
202 //! @param eigen_values vector with eigen values on output
203 //! @param X data matrix, samples in rows, features in columns
204
205 gsl_matrix *compute_pca_cv (NODE_T *p, gsl_vector **eigen_values, gsl_matrix *X)
206 {
207 //
208 // Forming the covariance matrix squares the condition number.
209 // Hence this routine might loose more significant digits than SVD.
210 // On the other hand, using PCA one looks for dominant eigenvectors.
211 //
212 MATH_RTE (p, X == NO_REAL_MATRIX, M_ROW_ROW_REAL, "empty data matrix");
213 // M samples, N features.
214 unt M = SIZE1 (X), N = SIZE2 (X);
215 // Keep X column-mean-centered.
216 gsl_matrix *C = column_mean (X);
217 // Covariance matrix, M samples: Cov = XᵀX.
218 M = MAX (M, N);
219 gsl_matrix *CV = gsl_matrix_calloc (M, M);
220 gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1, C, C, 0, CV);
221 // Compute and sort eigenvectors.
222 gsl_vector *Ev = gsl_vector_calloc (M);
223 gsl_matrix *eigen_vectors = gsl_matrix_calloc (M, M);
224 gsl_eigen_symmv_workspace *W = gsl_eigen_symmv_alloc (M);
225 ASSERT_GSL (gsl_eigen_symmv (CV, Ev, eigen_vectors, W));
226 ASSERT_GSL (gsl_eigen_symmv_sort (Ev, eigen_vectors, GSL_EIGEN_SORT_ABS_DESC));
227 // Return eigen values, if required.
228 if (eigen_values != NO_REF_VECTOR) {
229 (*eigen_values) = gsl_vector_calloc (N);
230 ASSERT_GSL (gsl_vector_memcpy ((*eigen_values), Ev));
231 }
232 // Clean up.
233 gsl_eigen_symmv_free (W);
234 gsl_matrix_free (C);
235 gsl_matrix_free (CV);
236 gsl_vector_free (Ev);
237 return eigen_vectors;
238 }
239
240 //! @brief PROC pca cv = ([, ] REAL) [, ] REAL
241
242 void genie_matrix_pca_cv (NODE_T * p)
243 {
244 // Principal component analysis of a MxN matrix.
245 gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
246 torrix_error_node = p;
247 A68_REF ref_row;
248 POP_REF (p, &ref_row);
249 gsl_matrix *X = pop_matrix (p, A68_TRUE);
250 gsl_vector *Ev;
251 gsl_matrix *eigen_vectors = compute_pca_cv (p, &Ev, X);
252 *DEREF (A68_REF, &ref_row) = vector_to_row (p, Ev);
253 push_matrix (p, eigen_vectors);
254 gsl_matrix_free (eigen_vectors);
255 gsl_matrix_free (X);
256 (void) gsl_set_error_handler (save_handler);
257 }
258
259 //! @brief compute PCA of MxN matrix by Jacobi SVD
260 //! @param singular_values vector with singular values on output
261 //! @param X data matrix, samples in rows, features in columns
262
263 gsl_matrix *compute_pca_svd (NODE_T *p, gsl_vector **singular_values, gsl_matrix *X)
264 {
265 //
266 // In PCA, SVD is closely related to eigenvector decomposition of the covariance matrix:
267 // If Cov = XᵀX = VLVᵀ and X = USVᵀ then XᵀX = VSUᵀUSVᵀ = VS²Vᵀ, hence L = S².
268 //
269 MATH_RTE (p, X == NO_REAL_MATRIX, M_ROW_ROW_REAL, "empty data matrix");
270 // Keep X column-mean-centered.
271 gsl_matrix *C = column_mean (X), *U;
272 // M samples, N features.
273 unt M = SIZE1 (X), N = SIZE2 (X);
274 // GSL does thin SVD, only handles M >= N, overdetermined systems.
275 BOOL_T transpose = (M < N);
276 if (transpose) {
277 // Xᵀ = VSUᵀ
278 M = SIZE2 (X); N = SIZE1 (X);
279 U = gsl_matrix_calloc (M, N);
280 gsl_matrix_transpose_memcpy (U, C);
281 } else {
282 // X = USVᵀ
283 U = gsl_matrix_calloc (M, N);
284 gsl_matrix_memcpy (U, C);
285 }
286 // X = USVᵀ by Jacobi, more precise than Golub-Reinsch.
287 // GSL routine yields V, not Vᵀ, U develops in place.
288 gsl_matrix *V = gsl_matrix_calloc (N, N);
289 gsl_vector *Sv = gsl_vector_calloc (N);
290 ASSERT_GSL (gsl_linalg_SV_decomp_jacobi (U, V, Sv));
291 // Return singular values, if required.
292 if (singular_values != NO_REF_VECTOR) {
293 (*singular_values) = gsl_vector_calloc (N);
294 ASSERT_GSL (gsl_vector_memcpy ((*singular_values), Sv));
295 }
296 gsl_matrix *eigen_vectors = gsl_matrix_calloc (N, M);
297 if (transpose) {
298 eigen_vectors = gsl_matrix_calloc (M, N);
299 ASSERT_GSL (gsl_matrix_memcpy (eigen_vectors, U));
300 } else {
301 eigen_vectors = gsl_matrix_calloc (M, N);
302 ASSERT_GSL (gsl_matrix_memcpy (eigen_vectors, V));
303 }
304 // Clean up.
305 gsl_matrix_free (C);
306 gsl_matrix_free (U);
307 gsl_matrix_free (V);
308 gsl_vector_free (Sv);
309 // Done.
310 return eigen_vectors;
311 }
312
313 //! @brief PROC pca svd = ([, ] REAL, REF [] REAL) [, ] REAL
314
315 void genie_matrix_pca_svd (NODE_T * p)
316 {
317 // Principal component analysis of a MxN matrix.
318 gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
319 torrix_error_node = p;
320 A68_REF ref_row;
321 POP_REF (p, &ref_row);
322 gsl_matrix *X = pop_matrix (p, A68_TRUE);
323 gsl_vector *Sv;
324 gsl_matrix *eigen_vectors = compute_pca_svd (p, &Sv, X);
325 if (!IS_NIL (ref_row)) {
326 *DEREF (A68_REF, &ref_row) = vector_to_row (p, Sv);
327 }
328 push_matrix (p, eigen_vectors);
329 gsl_matrix_free (eigen_vectors);
330 gsl_matrix_free (X);
331 (void) gsl_set_error_handler (save_handler);
332 }
333
334
335 static gsl_matrix *mat_before_ab (NODE_T *p, gsl_matrix *u, gsl_matrix *v)
336 {
337 // Form A := A BEFORE B. Deallocate A, otherwise PLS1 leaks memory.
338 gsl_matrix *w = matrix_hcat (p, u, v);
339 if (u != NO_REAL_MATRIX) {
340 gsl_matrix_free (u);
341 }
342 return w;
343 }
344
345 static gsl_matrix *mat_over_ab (NODE_T *p, gsl_matrix *u, gsl_matrix *v)
346 {
347 // Form A := A OVER B. Deallocate A, otherwise PLS1 leaks memory.
348 gsl_matrix *w = matrix_vcat (p, u, v);
349 if (u != NO_REAL_MATRIX) {
350 gsl_matrix_free (u);
351 }
352 return w;
353 }
354
355 //! @brief PROC pls1 = ([, ] REAL, [, ] REAL, INT, REF [] REAL) [, ] REAL
356
357 void genie_matrix_pls1 (NODE_T *p)
358 {
359 // PLS decomposes X and Y data concurrently as
360 //
361 // X = T Pᵀ + E
362 // Y = U Qᵀ + F
363 //
364 // PLS1 is a widely used algorithm appropriate for the vector Y case.
365 //
366 // PLS1 with SVD solver for beta.
367 // NIPALS algorithm with orthonormal scores and loadings.
368 // See Ulf Indahl, Journal of Chemometrics 28(3) 168-180 [2014].
369 //
370 // Note that E is an MxN matrix and f, beta are Nx1 column vectors.
371 // This for consistency with PLS2.
372 //
373 // Decompose Nc eigenvectors.
374 //
375 gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
376 torrix_error_node = p;
377 // Pop arguments.
378 A68_REF ref_eigenv;
379 POP_REF (p, &ref_eigenv);
380 A68_INT Nc;
381 POP_OBJECT (p, &Nc, A68_INT);
382 gsl_matrix *F = pop_matrix (p, A68_TRUE);
383 gsl_matrix *E = pop_matrix (p, A68_TRUE);
384 // Catch wrong calls.
385 unt Me = SIZE1 (E), Ne = SIZE2 (E), Mf = SIZE1 (F), Nf = SIZE2 (F);
386 MATH_RTE (p, Mf == 0 || Nf == 0, M_ROW_ROW_REAL, "invalid column vector F");
387 MATH_RTE (p, Me == 0 || Ne == 0, M_ROW_ROW_REAL, "invalid data matrix E");
388 MATH_RTE (p, Me != Mf, M_ROW_ROW_REAL, "rows in F must match columns in E");
389 MATH_RTE (p, Nf != 1, M_ROW_ROW_REAL, "F must be a column vector");
390 MATH_RTE (p, VALUE (&Nc) < 0, M_INT, "invalid number of components");
391 CHECK_INT_SHORTEN(p, VALUE (&Nc));
392 // Sensible defaults.
393 int Nk = VALUE (&Nc);
394 if (Nk > Ne) {
395 Nk = Ne;
396 } else if (Nk == 0) { // All eigenvectors.
397 Nk = Me;
398 }
399 // Decompose E and F.
400 gsl_matrix *EIGEN = NO_REAL_MATRIX, *nE = NO_REAL_MATRIX, *nF = NO_REAL_MATRIX;
401 gsl_matrix *eig = gsl_matrix_calloc (Ne, 1);
402 gsl_matrix *lat = gsl_matrix_calloc (Me, 1);
403 gsl_matrix *pl = gsl_matrix_calloc (Ne, 1);
404 gsl_matrix *ql = gsl_matrix_calloc (Nf, 1); // 1x1 in PLS1
405 gsl_vector *Ev = gsl_vector_calloc (Nk);
406 // Latent variable deflation.
407 for (int k = 0; k < Nk; k ++) {
408 // E weight from E, F covariance.
409 // eig = Eᵀ * f / | Eᵀ * f |
410 ASSERT_GSL (gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, E, F, 0.0, eig));
411 REAL_T norm = matrix_norm (eig);
412 ASSERT_GSL (gsl_matrix_scale (eig, 1.0 / norm));
413 gsl_vector_set (Ev, k, norm);
414 // Compute latent variable.
415 // lat = E * eig / | E * eig |
416 ASSERT_GSL (gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, E, eig, 0.0, lat));
417 norm = matrix_norm (lat);
418 ASSERT_GSL (gsl_matrix_scale (lat, 1.0 / norm));
419 // Deflate E and F, remove latent variable from both.
420 // pl = Eᵀ * lat; E -= lat * plᵀ and ql = Fᵀ * lat; F -= lat * qlᵀ
421 ASSERT_GSL (gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, E, lat, 0.0, pl));
422 ASSERT_GSL (gsl_blas_dgemm (CblasNoTrans, CblasTrans, -1.0, lat, pl, 1.0, E));
423 ASSERT_GSL (gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, lat, F, 0.0, ql));
424 // Build matrices.
425 EIGEN = mat_before_ab (p, EIGEN, eig);
426 nE = mat_before_ab (p, nE, pl); // P
427 nF = mat_over_ab (p, nF, ql); // Qᵀ
428 }
429 // Intermediate garbage collector.
430 gsl_matrix_free (E);
431 gsl_matrix_free (F);
432 gsl_matrix_free (eig);
433 gsl_matrix_free (lat);
434 gsl_matrix_free (pl);
435 gsl_matrix_free (ql);
436 // Projection of original data = Eᵀ * EIGEN
437 gsl_matrix *Pr = gsl_matrix_calloc (SIZE2 (nE), SIZE2 (EIGEN));
438 ASSERT_GSL (gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, nE, EIGEN, 0.0, Pr));
439 //
440 // Now beta = EIGEN * Pr⁻¹ * nF
441 // Compute with SVD to avoid matrix inversion.
442 // Matrix inversion is a source of numerical noise!
443 //
444 gsl_matrix *u, *v; gsl_vector *s, *w;
445 unt M = SIZE1 (Pr), N = SIZE2 (Pr);
446 // GSL computes thin SVD, M >= N only, returning V, not Vᵀ.
447 if (M >= N) {
448 // X = USVᵀ
449 s = gsl_vector_calloc (N);
450 u = gsl_matrix_calloc (M, N);
451 v = gsl_matrix_calloc (N, N);
452 w = gsl_vector_calloc (N);
453 gsl_matrix_memcpy (u, Pr);
454 ASSERT_GSL (gsl_linalg_SV_decomp (u, v, s, w));
455 } else {
456 // Xᵀ = VSUᵀ
457 s = gsl_vector_calloc (M);
458 u = gsl_matrix_calloc (N, M);
459 v = gsl_matrix_calloc (M, M);
460 w = gsl_vector_calloc (M);
461 gsl_matrix_transpose_memcpy (u, Pr);
462 ASSERT_GSL (gsl_linalg_SV_decomp (u, v, s, w));
463 }
464 // Kill short eigenvectors cf. Python numpy.
465 for (int k = 1; k < SIZE (s); k++) {
466 if (gsl_vector_get (s, k) / gsl_vector_get (s, 0) < 1e-15) {
467 gsl_vector_set (s, k, 0.0);
468 }
469 }
470 // Solve for beta.
471 gsl_vector *nFv = gsl_vector_calloc (Nk), *x = gsl_vector_calloc (Nk);
472 gsl_matrix_get_col (nFv, nF, 0);
473 if (M >= N) {
474 ASSERT_GSL (gsl_linalg_SV_solve (u, v, s, nFv, x));
475 } else {
476 ASSERT_GSL (gsl_linalg_SV_solve (v, u, s, nFv, x));
477 }
478 gsl_matrix *beta = gsl_matrix_calloc (Ne, 1), *xmat = gsl_matrix_calloc (Nk, 1);
479 ASSERT_GSL (gsl_matrix_set_col (xmat, 0, x));
480 ASSERT_GSL (gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, EIGEN, xmat, 0.0, beta));
481 // Yield results.
482 if (!IS_NIL (ref_eigenv)) {
483 *DEREF (A68_REF, &ref_eigenv) = vector_to_row (p, Ev);
484 }
485 push_matrix (p, beta);
486 // Garbage collector.
487 gsl_matrix_free (beta);
488 gsl_matrix_free (EIGEN);
489 gsl_matrix_free (nE);
490 gsl_matrix_free (nF);
491 gsl_matrix_free (Pr);
492 gsl_matrix_free (u);
493 gsl_matrix_free (v);
494 gsl_matrix_free (xmat);
495 gsl_vector_free (Ev);
496 gsl_vector_free (nFv);
497 gsl_vector_free (s);
498 gsl_vector_free (w);
499 gsl_vector_free (x);
500 //
501 (void) gsl_set_error_handler (save_handler);
502 }
503
504 //! @brief PROC pls1 lim = ([, ] REAL, [, ] REAL, REAL, REF [] REAL) [, ] REAL
505
506 void genie_matrix_pls1_lim (NODE_T *p)
507 {
508 //
509 // PLS1 with SVD solver for beta.
510 // NIPALS algorithm with orthonormal scores and loadings.
511 // See Ulf Indahl, Journal of Chemometrics 28(3) 168-180 [2014].
512 //
513 // Note that E is an MxN matrix and f, beta are Nx1 column vectors.
514 // This for consistency with PLS2.
515 //
516 // Decompose eigenvectors until relative size is too short.
517 //
518 gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
519 torrix_error_node = p;
520 // Pop arguments.
521 A68_REF ref_eigenv;
522 POP_REF (p, &ref_eigenv);
523 // 'lim' is minimum relative length to first eigenvector.
524 A68_REAL lim;
525 POP_OBJECT (p, &lim, A68_REAL);
526 gsl_matrix *F = pop_matrix (p, A68_TRUE);
527 gsl_matrix *E = pop_matrix (p, A68_TRUE);
528 // Catch wrong calls.
529 unt Me = SIZE1 (E), Ne = SIZE2 (E), Mf = SIZE1 (F), Nf = SIZE2 (F);
530 MATH_RTE (p, Mf == 0 || Nf == 0, M_ROW_ROW_REAL, "invalid column vector F");
531 MATH_RTE (p, Me == 0 || Ne == 0, M_ROW_ROW_REAL, "invalid data matrix E");
532 MATH_RTE (p, Me != Mf, M_ROW_ROW_REAL, "rows in F must match columns in E");
533 MATH_RTE (p, Nf != 1, M_ROW_ROW_REAL, "F must be a column vector");
534 MATH_RTE (p, VALUE (&lim) < 0 || VALUE (&lim) > 1.0, M_REAL, "invalid relative length");
535 // Sensible defaults.
536 int Nk = MIN (Ne, Mf);
537 // Decompose E and F.
538 gsl_matrix *EIGEN = NO_REAL_MATRIX, *nE = NO_REAL_MATRIX, *nF = NO_REAL_MATRIX;
539 gsl_matrix *eig = gsl_matrix_calloc (Ne, 1);
540 gsl_matrix *lat = gsl_matrix_calloc (Me, 1);
541 gsl_matrix *pl = gsl_matrix_calloc (Ne, 1);
542 gsl_matrix *ql = gsl_matrix_calloc (Nf, 1); // 1x1 in PLS1
543 gsl_vector *Ev = gsl_vector_calloc (Nk);
544 // Latent variable deflation.
545 int go_on = A68_TRUE;
546 for (int k = 0; k < Nk && go_on; k ++) {
547 // E weight from E, F covariance.
548 // eig = Eᵀ * f / | Eᵀ * f |
549 ASSERT_GSL (gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, E, F, 0.0, eig));
550 REAL_T norm = matrix_norm (eig);
551 ASSERT_GSL (gsl_matrix_scale (eig, 1.0 / norm));
552 gsl_vector_set (Ev, k, norm);
553 // Compute latent variable.
554 // lat = E * eig / | E * eig |
555 ASSERT_GSL (gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, E, eig, 0.0, lat));
556 norm = matrix_norm (lat);
557 ASSERT_GSL (gsl_matrix_scale (lat, 1.0 / norm));
558 // Deflate E and F, remove latent variable from both.
559 // pl = Eᵀ * lat; E -= lat * plᵀ and ql = Fᵀ * lat; F -= lat * qlᵀ
560 ASSERT_GSL (gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, E, lat, 0.0, pl));
561 ASSERT_GSL (gsl_blas_dgemm (CblasNoTrans, CblasTrans, -1.0, lat, pl, 1.0, E));
562 ASSERT_GSL (gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, lat, F, 0.0, ql));
563 // Build matrices.
564 EIGEN = mat_before_ab (p, EIGEN, eig);
565 nE = mat_before_ab (p, nE, pl); // P
566 nF = mat_over_ab (p, nF, ql); // Qᵀ
567 // Another iteration?
568 if (k > 0 && gsl_vector_get (Ev, k) / gsl_vector_get (Ev, 0) < VALUE (&lim)) {
569 Nk = k + 1;
570 go_on = A68_FALSE;
571 }
572 }
573 // Intermediate garbage collector.
574 gsl_matrix_free (E);
575 gsl_matrix_free (F);
576 gsl_matrix_free (eig);
577 gsl_matrix_free (lat);
578 gsl_matrix_free (pl);
579 gsl_matrix_free (ql);
580 // Projection of original data = Eᵀ * EIGEN
581 gsl_matrix *Pr = gsl_matrix_calloc (SIZE2 (nE), SIZE2 (EIGEN));
582 ASSERT_GSL (gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, nE, EIGEN, 0.0, Pr));
583 //
584 // Now beta = EIGEN * Pr⁻¹ * nF
585 // Compute with SVD to avoid matrix inversion.
586 // Matrix inversion is a source of numerical noise!
587 //
588 gsl_matrix *u, *v; gsl_vector *s, *w;
589 unt M = SIZE1 (Pr), N = SIZE2 (Pr);
590 // GSL computes thin SVD, M >= N only, returning V, not Vᵀ.
591 if (M >= N) {
592 // X = USVᵀ
593 s = gsl_vector_calloc (N);
594 u = gsl_matrix_calloc (M, N);
595 v = gsl_matrix_calloc (N, N);
596 w = gsl_vector_calloc (N);
597 gsl_matrix_memcpy (u, Pr);
598 ASSERT_GSL (gsl_linalg_SV_decomp (u, v, s, w));
599 } else {
600 // Xᵀ = VSUᵀ
601 s = gsl_vector_calloc (M);
602 u = gsl_matrix_calloc (N, M);
603 v = gsl_matrix_calloc (M, M);
604 w = gsl_vector_calloc (M);
605 gsl_matrix_transpose_memcpy (u, Pr);
606 ASSERT_GSL (gsl_linalg_SV_decomp (u, v, s, w));
607 }
608 // Kill short eigenvectors cf. Python numpy.
609 for (int k = 1; k < SIZE (s); k++) {
610 if (gsl_vector_get (s, k) / gsl_vector_get (s, 0) < 1e-15) {
611 gsl_vector_set (s, k, 0.0);
612 }
613 }
614 // Solve for beta.
615 gsl_vector *nFv = gsl_vector_calloc (Nk), *x = gsl_vector_calloc (Nk);
616 gsl_matrix_get_col (nFv, nF, 0);
617 if (M >= N) {
618 ASSERT_GSL (gsl_linalg_SV_solve (u, v, s, nFv, x));
619 } else {
620 ASSERT_GSL (gsl_linalg_SV_solve (v, u, s, nFv, x));
621 }
622 gsl_matrix *beta = gsl_matrix_calloc (Ne, 1), *xmat = gsl_matrix_calloc (Nk, 1);
623 ASSERT_GSL (gsl_matrix_set_col (xmat, 0, x));
624 ASSERT_GSL (gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, EIGEN, xmat, 0.0, beta));
625 // Yield results.
626 if (!IS_NIL (ref_eigenv)) {
627 gsl_vector *Evl = gsl_vector_calloc (Nk);
628 for (unt k = 0; k < Nk; k++) {
629 gsl_vector_set (Evl, k, gsl_vector_get (Evl, k));
630 }
631 *DEREF (A68_REF, &ref_eigenv) = vector_to_row (p, Evl);
632 gsl_vector_free (Evl);
633 }
634 push_matrix (p, beta);
635 // Garbage collector.
636 gsl_matrix_free (beta);
637 gsl_matrix_free (EIGEN);
638 gsl_matrix_free (nE);
639 gsl_matrix_free (nF);
640 gsl_matrix_free (Pr);
641 gsl_matrix_free (u);
642 gsl_matrix_free (v);
643 gsl_matrix_free (xmat);
644 gsl_vector_free (Ev);
645 gsl_vector_free (nFv);
646 gsl_vector_free (s);
647 gsl_vector_free (w);
648 gsl_vector_free (x);
649 //
650 (void) gsl_set_error_handler (save_handler);
651 }
652
653 #endif