single-multivariate.c
1 //! @file single-multivariate.c
2 //! @author J. Marcel van der Veer
3
4 //! @section Copyright
5 //!
6 //! This file is part of Algol68G - an Algol 68 compiler-interpreter.
7 //! Copyright 2001-2024 J. Marcel van der Veer [algol68g@xs4all.nl].
8
9 //! @section License
10 //!
11 //! This program is free software; you can redistribute it and/or modify it
12 //! under the terms of the GNU General Public License as published by the
13 //! Free Software Foundation; either version 3 of the License, or
14 //! (at your option) any later version.
15 //!
16 //! This program is distributed in the hope that it will be useful, but
17 //! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18 //! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
19 //! more details. You should have received a copy of the GNU General Public
20 //! License along with this program. If not, see [http://www.gnu.org/licenses/].
21
22 //! @section Synopsis
23 //!
24 //! REAL multivariate regression.
25
26 // This code implements least square regression methods:
27 // OLS - Ordinary Least Squares
28 // PCR - Principle Component Regression, OLS after dimension reduction
29 // TLS - Total Least Squares
30 // PLS - Partial Least Squares, TLS after dimension reduction
31
32 #include "a68g.h"
33
34 #if defined (HAVE_GSL)
35
36 #include "a68g-torrix.h"
37 #include "a68g-prelude-gsl.h"
38
39 #define SMALL_EIGEN 1e-9
40
41 //! @brief column-centered data matrix.
42
43 gsl_matrix *column_mean (gsl_matrix *DATA)
44 {
45 // A is MxN; M samples x N features.
46 unt M = SIZE1 (DATA), N = SIZE2 (DATA);
47 gsl_matrix *C = gsl_matrix_calloc (M, N);
48 for (int i = 0; i < N; i++) {
49 DOUBLE_T sum = 0;
50 for (int j = 0; j < M; j++) {
51 sum += gsl_matrix_get (DATA, j, i);
52 }
53 REAL_T mean = sum / M;
54 for (int j = 0; j < M; j++) {
55 gsl_matrix_set (C, j, i, gsl_matrix_get (DATA, j, i) - mean);
56 }
57 }
58 return C;
59 }
60
61 //! @brief left columns from matrix.
62
63 static gsl_matrix *left_columns (NODE_T *p, gsl_matrix *X, int cols)
64 {
65 MATH_RTE (p, cols < 0, M_INT, "invalid number of columns");
66 unt M = SIZE1 (X), N = SIZE2 (X);
67 if (cols == 0 || cols > N) {
68 cols = N;
69 }
70 gsl_matrix *Y = gsl_matrix_calloc (M, cols);
71 for (int i = 0; i < M; i++) {
72 for (int j = 0; j < cols; j++) {
73 gsl_matrix_set (Y, i, j, gsl_matrix_get (X, i, j));
74 }
75 }
76 return Y;
77 }
78
79 //! @brief Compute Moore-Penrose pseudo inverse.
80
81 void compute_pseudo_inverse (NODE_T *p, gsl_matrix **mpinv, gsl_matrix *X, REAL_T lim)
82 {
83 // The Moore-Penrose pseudo inverse gives a least-square approximate solution
84 // for a system of linear equations not having an exact solution.
85 // Multivariate statistics is a well known application.
86 MATH_RTE (p, X == NO_REAL_MATRIX, M_ROW_ROW_REAL, "empty data matrix");
87 MATH_RTE (p, lim < 0, M_REAL, "invalid limit");
88 unt M = SIZE1 (X), N = SIZE2 (X);
89 // GSL only handles M >= N, transpose commutes with pseudo inverse.
90 gsl_matrix *U;
91 BOOL_T transpose = (M < N);
92 if (transpose) {
93 M = SIZE2 (X); N = SIZE1 (X);
94 U = gsl_matrix_calloc (M, N);
95 gsl_matrix_transpose_memcpy (U, X);
96 } else {
97 U = gsl_matrix_calloc (M, N);
98 gsl_matrix_memcpy (U, X);
99 }
100 // A = USVᵀ by Jacobi, more precise than Golub-Reinsch.
101 // The GSL routine yields V, not Vᵀ. U is decomposed in place.
102 gsl_matrix *V = gsl_matrix_calloc (N, N);
103 gsl_vector *Sv = gsl_vector_calloc (N);
104 ASSERT_GSL (gsl_linalg_SV_decomp_jacobi (U, V, Sv));
105 // Compute S⁻¹.
106 // Very small eigenvalues wreak havoc on a pseudo inverse.
107 // So diagonal elements < 'lim * largest element' are set to zero.
108 // Python NumPy default is 1e-15, but assumes a Hermitian matrix.
109 // SVD gives singular values sorted in descending order.
110 REAL_T x0 = gsl_vector_get (Sv, 0);
111 MATH_RTE (p, x0 == 0, M_ROW_ROW_REAL, "zero eigenvalue or singular value");
112 gsl_matrix *S_inv = gsl_matrix_calloc (N, M);
113 gsl_matrix_set (S_inv, 0, 0, 1 / x0);
114 for (int i = 1; i < N; i++) {
115 REAL_T x = gsl_vector_get (Sv, i);
116 if ((x / x0) > lim) {
117 gsl_matrix_set (S_inv, i, i, 1 / x);
118 } else {
119 gsl_matrix_set (S_inv, i, i, 0);
120 }
121 }
122 a68_vector_free (Sv);
123 // GSL SVD yields thin SVD - pad with zeros.
124 gsl_matrix *Uf = gsl_matrix_calloc (M, M);
125 for (int i = 0; i < M; i++) {
126 for (int j = 0; j < N; j++) {
127 gsl_matrix_set (Uf, i, j, gsl_matrix_get (U, i, j));
128 }
129 }
130 // Compute pseudo inverse A⁻¹ = VS⁻¹Uᵀ.
131 gsl_matrix *VS_inv = NO_REAL_MATRIX, *X_inv = NO_REAL_MATRIX;
132 a68_dgemm (SELF, SELF, 1, V, S_inv, 0, &VS_inv);
133 a68_dgemm (SELF, FLIP, 1, VS_inv, Uf, 0, &X_inv);
134 // Compose result.
135 if (transpose) {
136 (*mpinv) = gsl_matrix_calloc (M, N);
137 gsl_matrix_transpose_memcpy ((*mpinv), X_inv);
138 } else {
139 (*mpinv) = gsl_matrix_calloc (N, M);
140 gsl_matrix_memcpy ((*mpinv), X_inv);
141 }
142 // Clean up.
143 a68_matrix_free (S_inv);
144 a68_matrix_free (U);
145 a68_matrix_free (Uf);
146 a68_matrix_free (V);
147 a68_matrix_free (VS_inv);
148 a68_matrix_free (X_inv);
149 }
150
151 //! @brief Compute Principal Component Analysis by SVD.
152
153 gsl_matrix *pca_svd (gsl_matrix *X, gsl_vector ** Sv)
154 {
155 // In PCA, SVD is closely related to eigenvector decomposition of the covariance matrix:
156 // If Cov = XᵀX = VLVᵀ and X = USVᵀ then XᵀX = VSUᵀUSVᵀ = VS²Vᵀ, hence L = S².
157 // M samples, N features.
158 unt M = SIZE1 (X), N = SIZE2 (X);
159 // GSL does thin SVD, only handles M >= N, overdetermined systems.
160 BOOL_T transpose = (M < N);
161 gsl_matrix *U = NO_REAL_MATRIX;
162 if (transpose) {
163 // Xᵀ = VSUᵀ
164 M = SIZE2 (X); N = SIZE1 (X);
165 U = gsl_matrix_calloc (M, N);
166 gsl_matrix_transpose_memcpy (U, X);
167 } else {
168 // X = USVᵀ
169 U = gsl_matrix_calloc (M, N);
170 gsl_matrix_memcpy (U, X);
171 }
172 // X = USVᵀ by Jacobi, more precise than Golub-Reinsch.
173 // GSL routine yields V, not Vᵀ, U develops in place.
174 gsl_matrix *V = gsl_matrix_calloc (N, N);
175 (*Sv) = gsl_vector_calloc (N);
176 ASSERT_GSL (gsl_linalg_SV_decomp_jacobi (U, V, (*Sv)));
177 // Return singular values, if required.
178 gsl_matrix *eigens = gsl_matrix_calloc (M, N);
179 if (transpose) {
180 ASSERT_GSL (gsl_matrix_memcpy (eigens, U));
181 } else {
182 ASSERT_GSL (gsl_matrix_memcpy (eigens, V));
183 }
184 a68_matrix_free (U);
185 a68_matrix_free (V);
186 return eigens;
187 }
188
189 //! @brief PROC pseudo inv = ([, ] REAL, REAL) [, ] REAL
190
191 void genie_matrix_pinv_lim (NODE_T * p)
192 {
193 // Compute the Moore-Penrose pseudo inverse of a MxN matrix.
194 // G. Strang. "Linear Algebra and its applications", 2nd edition.
195 // Academic Press [1980].
196 gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
197 A68_REAL lim;
198 POP_OBJECT (p, &lim, A68_REAL);
199 gsl_matrix *X = pop_matrix (p, A68_TRUE), *mpinv = NO_REAL_MATRIX;
200 compute_pseudo_inverse (p, &mpinv, X, VALUE (&lim));
201 push_matrix (p, mpinv);
202 a68_matrix_free (mpinv);
203 a68_matrix_free (X);
204 (void) gsl_set_error_handler (save_handler);
205 }
206
207 //! @brief OP PINV = ([, ] REAL) [, ] REAL
208
209 void genie_matrix_pinv (NODE_T * p)
210 {
211 PUSH_VALUE (p, SMALL_EIGEN, A68_REAL);
212 genie_matrix_pinv_lim (p);
213 }
214
215 //! @brief OP MEAN = ([, ] REAL) [, ] REAL
216
217 void genie_matrix_column_mean (NODE_T *p)
218 {
219 gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
220 gsl_matrix *Z = pop_matrix (p, A68_TRUE);
221 unt M = SIZE1 (Z), N = SIZE2 (Z);
222 gsl_matrix *A = gsl_matrix_calloc (M, N);
223 for (int i = 0; i < N; i++) {
224 DOUBLE_T sum = 0;
225 for (int j = 0; j < M; j++) {
226 sum += gsl_matrix_get (Z, j, i);
227 }
228 DOUBLE_T mean = sum / M;
229 for (int j = 0; j < M; j++) {
230 gsl_matrix_set (A, j, i, mean);
231 }
232 }
233 push_matrix (p, A);
234 a68_matrix_free (A);
235 a68_matrix_free (Z);
236 (void) gsl_set_error_handler (save_handler);
237 }
238
239 //! @brief PROC left columns = ([, ] REAL, INT) [, ] REAL
240
241 void genie_left_columns (NODE_T *p)
242 {
243 A68_INT k;
244 POP_OBJECT (p, &k, A68_INT);
245 gsl_matrix *X = pop_matrix (p, A68_TRUE);
246 gsl_matrix *Y = left_columns (p, X, VALUE (&k));
247 push_matrix (p, Y);
248 a68_matrix_free (X);
249 a68_matrix_free (Y);
250 }
251
252 //! @brief PROC pca cv = ([, ] REAL, REF [] REAL) [, ] REAL
253
254 void genie_matrix_pca_cv (NODE_T * p)
255 {
256 // Principal component analysis of a MxN matrix from a covariance matrix.
257 // Forming the covariance matrix squares the condition number.
258 // Hence this routine might loose more significant digits than SVD.
259 // On the other hand, using PCA one looks for dominant eigenvectors.
260 gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
261 A68_REF singulars;
262 POP_REF (p, &singulars);
263 gsl_matrix *X = pop_matrix (p, A68_TRUE);
264 MATH_RTE (p, X == NO_REAL_MATRIX, M_ROW_ROW_REAL, "empty data matrix");
265 // M samples, N features.
266 unt M = SIZE1 (X), N = SIZE2 (X);
267 // Keep X column-mean-centered.
268 gsl_matrix *C = column_mean (X);
269 // Covariance matrix, M samples: Cov = XᵀX.
270 M = MAX (M, N);
271 gsl_matrix *CV = NO_REAL_MATRIX;
272 a68_dgemm (FLIP, SELF, 1, C, C, 0, &CV);
273 // Compute and sort eigenvectors.
274 gsl_vector *Sv = gsl_vector_calloc (M);
275 gsl_matrix *eigens = gsl_matrix_calloc (M, M);
276 gsl_eigen_symmv_workspace *W = gsl_eigen_symmv_alloc (M);
277 ASSERT_GSL (gsl_eigen_symmv (CV, Sv, eigens, W));
278 ASSERT_GSL (gsl_eigen_symmv_sort (Sv, eigens, GSL_EIGEN_SORT_ABS_DESC));
279 // Yield results.
280 if (!IS_NIL (singulars)) {
281 *DEREF (A68_REF, &singulars) = vector_to_row (p, Sv);
282 }
283 push_matrix (p, eigens);
284 (void) gsl_set_error_handler (save_handler);
285 // Garbage collector
286 a68_matrix_free (eigens);
287 a68_matrix_free (X);
288 gsl_eigen_symmv_free (W);
289 a68_matrix_free (C);
290 a68_matrix_free (CV);
291 a68_vector_free (Sv);
292 }
293
294 //! @brief PROC pca svd = ([, ] REAL, REF [] REAL) [, ] REAL
295
296 void genie_matrix_pca_svd (NODE_T * p)
297 {
298 // Principal component analysis of a MxN matrix by Jacobi SVD.
299 gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
300 A68_REF singulars;
301 POP_REF (p, &singulars);
302 // X data matrix, samples in rows, features in columns
303 gsl_matrix *X = pop_matrix (p, A68_TRUE);
304 MATH_RTE (p, X == NO_REAL_MATRIX, M_ROW_ROW_REAL, "empty data matrix");
305 // Keep X column-mean-centered.
306 gsl_matrix *C = column_mean (X);
307 gsl_vector *Sv = NO_REAL_VECTOR;
308 gsl_matrix *eigens = pca_svd (C, &Sv);
309 // Yield results.
310 if (!IS_NIL (singulars)) {
311 *DEREF (A68_REF, &singulars) = vector_to_row (p, Sv);
312 }
313 push_matrix (p, eigens);
314 (void) gsl_set_error_handler (save_handler);
315 // Clean up.
316 a68_matrix_free (eigens);
317 a68_matrix_free (X);
318 a68_matrix_free (C);
319 if (Sv != NO_REAL_VECTOR) {
320 a68_vector_free (Sv);
321 }
322 }
323
324 //! @brief PROC pcr = ([, ] REAL, [, ] REAL, REF [] REAL, INT, REAL) [, ] REAL
325
326 void genie_matrix_pcr (NODE_T * p)
327 {
328 // Principal Component Regression of a MxN matrix by Jacobi SVD.
329 gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
330 // Pop arguments.
331 // 'lim' is minimum relative length to first eigenvector.
332 A68_REAL lim;
333 POP_OBJECT (p, &lim, A68_REAL);
334 // 'Nc' is maximum number of eigenvectors.
335 A68_INT Nc;
336 POP_OBJECT (p, &Nc, A68_INT);
337 A68_REF singulars;
338 POP_REF (p, &singulars);
339 // Y data vector, features in a single column
340 gsl_matrix *Y = pop_matrix (p, A68_TRUE);
341 MATH_RTE (p, Y == NO_REAL_MATRIX, M_ROW_ROW_REAL, "empty data matrix");
342 unt My = SIZE1 (Y), Ny = SIZE2 (Y);
343 // X data matrix, samples in rows, features in columns
344 gsl_matrix *X = pop_matrix (p, A68_TRUE);
345 MATH_RTE (p, X == NO_REAL_MATRIX, M_ROW_ROW_REAL, "empty data matrix");
346 unt Mx = SIZE1 (X), Nx = SIZE2 (X);
347 // Catch wrong calls.
348 MATH_RTE (p, My == 0 || Ny == 0, M_ROW_ROW_REAL, "invalid column vector F");
349 MATH_RTE (p, Mx == 0 || Nx == 0, M_ROW_ROW_REAL, "invalid data matrix E");
350 MATH_RTE (p, Mx != My, M_ROW_ROW_REAL, "rows in F must match columns in E");
351 MATH_RTE (p, Ny != 1, M_ROW_ROW_REAL, "F must be a column vector");
352 MATH_RTE (p, VALUE (&lim) < 0 || VALUE (&lim) > 1.0, M_REAL, "invalid relative length");
353 // Keep X column-mean-centered.
354 gsl_matrix *C = column_mean (X);
355 gsl_vector *Sv = NO_REAL_VECTOR;
356 gsl_matrix *eigens = pca_svd (C, &Sv);
357 // Dimension reduction.
358 int Nk = (VALUE (&Nc) == 0 ? SIZE (Sv) : MIN (SIZE (Sv), VALUE (&Nc)));
359 if (VALUE (&lim) <= 0) {
360 VALUE (&lim) = SMALL_EIGEN;
361 }
362 // Too short eigenvectors wreak havoc.
363 REAL_T Sv0 = gsl_vector_get (Sv, 0);
364 for (int k = 1; k < SIZE (Sv); k++) {
365 if (gsl_vector_get (Sv, k) / Sv0 < VALUE (&lim)) {
366 if (k + 1 < Nk) {
367 Nk = k + 1;
368 }
369 }
370 }
371 gsl_matrix *reduced = left_columns (p, eigens, Nk);
372 // Compute projected set = X * reduced.
373 gsl_matrix *proj = NO_REAL_MATRIX;
374 a68_dgemm (SELF, SELF, 1.0, X, reduced, 0.0, &proj);
375 // Compute β = reduced * P⁻¹ * Y.
376 gsl_matrix *mpinv = NO_REAL_MATRIX;
377 compute_pseudo_inverse (p, &mpinv, proj, SMALL_EIGEN);
378 gsl_matrix *z = NO_REAL_MATRIX, *beta = NO_REAL_MATRIX;
379 a68_dgemm (SELF, SELF, 1.0, reduced, mpinv, 0.0, &z);
380 a68_dgemm (SELF, SELF, 1.0, z, Y, 0.0, &beta);
381 // Yield results.
382 if (!IS_NIL (singulars)) {
383 *DEREF (A68_REF, &singulars) = vector_to_row (p, Sv);
384 }
385 push_matrix (p, beta);
386 (void) gsl_set_error_handler (save_handler);
387 // Clean up.
388 a68_matrix_free (eigens);
389 a68_matrix_free (reduced);
390 a68_matrix_free (X);
391 a68_matrix_free (Y);
392 a68_matrix_free (C);
393 a68_vector_free (Sv);
394 a68_matrix_free (z);
395 a68_matrix_free (mpinv);
396 a68_matrix_free (proj);
397 a68_matrix_free (beta);
398 }
399
400 //! @brief PROC ols = ([, ] REAL, [, ] REAL) [, ] REAL
401
402 void genie_matrix_ols (NODE_T *p)
403 {
404 // TLS relates to PLS as OLS relates to PCR.
405 // Note that X is an MxN matrix and Y, β are Nx1 column vectors.
406 // OLS can implemented (inefficiently) as PCR with maximum number of singular values:
407 // PUSH_VALUE (p, 0.0, A68_REAL);
408 // PUSH_VALUE (p, 0, A68_INT);
409 // genie_matrix_pcr (p);
410 gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
411 // X and Y matrices.
412 gsl_matrix *Y = pop_matrix (p, A68_TRUE);
413 gsl_matrix *X = pop_matrix (p, A68_TRUE);
414 // Compute β = X⁻¹ * Y.
415 gsl_matrix *mpinv = NO_REAL_MATRIX, *beta = NO_REAL_MATRIX;
416 compute_pseudo_inverse (p, &mpinv, X, SMALL_EIGEN);
417 a68_dgemm (SELF, SELF, 1.0, X, Y, 0.0, &beta);
418 // Yield results.
419 push_matrix (p, beta);
420 (void) gsl_set_error_handler (save_handler);
421 a68_matrix_free (beta);
422 a68_matrix_free (mpinv);
423 a68_matrix_free (X);
424 a68_matrix_free (Y);
425 }
426
427 //! @brief PROC pls1 = ([, ] REAL, [, ] REAL, REF [] REAL, INT, REAL) [, ] REAL
428
429 void genie_matrix_pls1 (NODE_T *p)
430 {
431 // PLS decomposes X and Y data concurrently as
432 // X = T Pᵀ + E
433 // Y = U Qᵀ + F
434 // PLS1 is a widely used algorithm appropriate for the vector Y case.
435 // This procedure computes PLS1 with SVD solver for β,
436 // by a NIPALS algorithm with orthonormal scores and loadings.
437 // See Ulf Indahl, Journal of Chemometrics 28(3) 168-180 [2014].
438 // Note that E is an MxN matrix and F, β are Nx1 column vectors.
439 // This for consistency with PLS2.
440 gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
441 // Pop arguments.
442 // 'lim' is minimum relative length to first eigenvector.
443 A68_REAL lim;
444 POP_OBJECT (p, &lim, A68_REAL);
445 // 'Nc' is maximum number of eigenvectors.
446 A68_INT Nc;
447 POP_OBJECT (p, &Nc, A68_INT);
448 A68_REF singulars;
449 POP_REF (p, &singulars);
450 // X and Y matrices.
451 gsl_matrix *F = pop_matrix (p, A68_TRUE);
452 gsl_matrix *E = pop_matrix (p, A68_TRUE);
453 // Catch wrong calls.
454 unt Me = SIZE1 (E), Ne = SIZE2 (E), Mf = SIZE1 (F), Nf = SIZE2 (F);
455 MATH_RTE (p, Mf == 0 || Nf == 0, M_ROW_ROW_REAL, "invalid column vector F");
456 MATH_RTE (p, Me == 0 || Ne == 0, M_ROW_ROW_REAL, "invalid data matrix E");
457 MATH_RTE (p, Me != Mf, M_ROW_ROW_REAL, "rows in F must match columns in E");
458 MATH_RTE (p, Nf != 1, M_ROW_ROW_REAL, "F must be a column vector");
459 MATH_RTE (p, VALUE (&lim) < 0 || VALUE (&lim) > 1.0, M_REAL, "invalid relative length");
460 // Sensible defaults.
461 int Nk = (VALUE (&Nc) == 0 ? MIN (Ne, Mf) : MIN (Mf, VALUE (&Nc)));
462 if (VALUE (&lim) <= 0) {
463 VALUE (&lim) = SMALL_EIGEN;
464 }
465 // Decompose E and F.
466 gsl_matrix *EIGEN = NO_REAL_MATRIX, *nE = NO_REAL_MATRIX, *nF = NO_REAL_MATRIX;
467 gsl_vector *Sv = gsl_vector_calloc (Nk);
468 BOOL_T siga = A68_TRUE;
469 gsl_matrix *eigens = NO_REAL_MATRIX, *lat = NO_REAL_MATRIX, *pl = NO_REAL_MATRIX, *ql = NO_REAL_MATRIX;
470 for (int k = 0; k < Nk && siga; k ++) {
471 // E weight from E, F covariance.
472 // eigens = Eᵀ * f / | Eᵀ * f |
473 a68_dgemm (FLIP, SELF, 1.0, E, F, 0.0, &eigens);
474 REAL_T norm = matrix_norm (eigens);
475 if (k > 0 && (norm / gsl_vector_get (Sv, 0)) < VALUE (&lim)) {
476 Nk = k;
477 siga = A68_FALSE;
478 } else {
479 ASSERT_GSL (gsl_matrix_scale (eigens, 1.0 / norm));
480 gsl_vector_set (Sv, k, norm);
481 // Compute latent variable.
482 // lat = E * eigens / | E * eigens |
483 a68_dgemm (SELF, SELF, 1.0, E, eigens, 0.0, &lat);
484 norm = matrix_norm (lat);
485 ASSERT_GSL (gsl_matrix_scale (lat, 1.0 / norm));
486 // Deflate E and F, remove latent variable from both.
487 // pl = Eᵀ * lat; E -= lat * plᵀ and ql = Fᵀ * lat; F -= lat * qlᵀ
488 a68_dgemm (FLIP, SELF, 1.0, E, lat, 0.0, &pl);
489 a68_dgemm (SELF, FLIP, -1.0, lat, pl, 1.0, &E);
490 a68_dgemm (FLIP, SELF, 1.0, F, lat, 0.0, &ql);
491 a68_dgemm (SELF, FLIP, -1.0, lat, ql, 1.0, &F);
492 // Build matrices.
493 EIGEN = mat_before_ab (p, EIGEN, eigens);
494 nE = mat_before_ab (p, nE, pl); // P
495 nF = mat_over_ab (p, nF, ql); // Qᵀ
496 }
497 }
498 // Projection of original data = Eᵀ * EIGEN
499 gsl_matrix *nP = NO_REAL_MATRIX;
500 a68_dgemm (FLIP, SELF, 1.0, nE, EIGEN, 0.0, &nP);
501 // Compute β = EIGEN * nP⁻¹ * nF
502 gsl_matrix *mpinv = NO_REAL_MATRIX, *z = NO_REAL_MATRIX, *beta = NO_REAL_MATRIX;
503 compute_pseudo_inverse (p, &mpinv, nP, SMALL_EIGEN);
504 a68_dgemm (SELF, SELF, 1.0, EIGEN, mpinv, 0.0, &z);
505 a68_dgemm (SELF, SELF, 1.0, z, nF, 0.0, &beta);
506 // Yield results.
507 if (!IS_NIL (singulars)) {
508 gsl_vector *Svl = gsl_vector_calloc (Nk);
509 for (int k = 0; k < Nk; k++) {
510 gsl_vector_set (Svl, k, gsl_vector_get (Sv, k));
511 }
512 *DEREF (A68_REF, &singulars) = vector_to_row (p, Svl);
513 a68_vector_free (Svl);
514 }
515 push_matrix (p, beta);
516 (void) gsl_set_error_handler (save_handler);
517 // Garbage collector.
518 a68_matrix_free (E);
519 a68_matrix_free (F);
520 a68_matrix_free (eigens);
521 a68_matrix_free (lat);
522 a68_matrix_free (pl);
523 a68_matrix_free (ql);
524 a68_matrix_free (beta);
525 a68_matrix_free (EIGEN);
526 a68_matrix_free (nE);
527 a68_matrix_free (nF);
528 a68_matrix_free (nP);
529 a68_vector_free (Sv);
530 }
531
532 //! @brief PROC pls2 = ([, ] REAL, [, ] REAL, REF [] REAL, INT, REAL) [, ] REAL
533
534 void genie_matrix_pls2 (NODE_T *p)
535 {
536 // PLS decomposes X and Y data concurrently as
537 // X = T Pᵀ + E
538 // Y = U Qᵀ + F
539 // Generic orthodox NIPALS, following Hervé Abdi, University of Texas.
540 // "Partial Least Squares Regression and Projection on Latent Structure Regression",
541 // Computational Statistics [2010].
542 #define PLS_MAX_ITER 100
543 gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
544 // Pop arguments.
545 // 'lim' is minimum relative length to first eigenvector.
546 A68_REAL lim;
547 POP_OBJECT (p, &lim, A68_REAL);
548 // 'Nc' is maximum number of eigenvectors.
549 A68_INT Nc;
550 POP_OBJECT (p, &Nc, A68_INT);
551 A68_REF singulars;
552 POP_REF (p, &singulars);
553 // X and Y matrices.
554 gsl_matrix *F = pop_matrix (p, A68_TRUE);
555 gsl_matrix *E = pop_matrix (p, A68_TRUE);
556 // Catch wrong calls.
557 unt Me = SIZE1 (E), Ne = SIZE2 (E), Mf = SIZE1 (F), Nf = SIZE2 (F);
558 MATH_RTE (p, Mf == 0 || Nf == 0, M_ROW_ROW_REAL, "invalid column vector F");
559 MATH_RTE (p, Me == 0 || Ne == 0, M_ROW_ROW_REAL, "invalid data matrix E");
560 MATH_RTE (p, Me != Mf, M_ROW_ROW_REAL, "rows in F must match columns in E");
561 CHECK_INT_SHORTEN (p, VALUE (&Nc));
562 // Sensible defaults.
563 int Nk = (VALUE (&Nc) == 0 ? MIN (Ne, Mf) : MIN (Mf, VALUE (&Nc)));
564 if (VALUE (&lim) <= 0) {
565 VALUE (&lim) = SMALL_EIGEN;
566 }
567 // Initial vector u.
568 gsl_matrix *u = gsl_matrix_calloc (Mf, 1);
569 if (Nf == 1) { // Column vector, PLS1
570 for (int k = 0; k < Mf; k ++) {
571 gsl_matrix_set (u, k, 0, gsl_matrix_get (F, k, 0));
572 }
573 } else {
574 for (int k = 0; k < Mf; k ++) {
575 gsl_matrix_set (u, k, 0, a68_gauss_rand ());
576 }
577 }
578 // Decompose E and F jointly.
579 gsl_vector *Sv = gsl_vector_calloc (Nk), *dD = gsl_vector_calloc (Nk);
580 gsl_matrix *u0 = gsl_matrix_calloc (Mf, 1), *diff = gsl_matrix_calloc (Mf, 1);
581 gsl_matrix *eigens = NO_REAL_MATRIX;
582 gsl_matrix *t = NO_REAL_MATRIX, *b = NO_REAL_MATRIX, *c = NO_REAL_MATRIX;
583 gsl_matrix *pl = NO_REAL_MATRIX, *ql = NO_REAL_MATRIX, *nP = NO_REAL_MATRIX, *nC = NO_REAL_MATRIX;
584 BOOL_T siga = A68_TRUE;
585 for (int k = 0; k < Nk && siga; k ++) {
586 REAL_T norm_e, norm;
587 for (int j = 0; j < PLS_MAX_ITER; j ++) {
588 gsl_matrix_memcpy (u0, u);
589 // Compute X weight. Note that Eᵀ * u is of form [Indahl]
590 // Eᵀ * F * Fᵀ * E * w / |Eᵀ * F * Fᵀ * w| = (1 / lambda) (Fᵀ * E)ᵀ * (Fᵀ * E) * w,
591 // that is, w is an eigenvector of covariance matrix Fᵀ * E and
592 // longest eigenvector of symmetric matrix Eᵀ * F * Fᵀ * E.
593 a68_dgemm (FLIP, SELF, 1.0, E, u, 0.0, &eigens);
594 norm_e = matrix_norm (eigens);
595 ASSERT_GSL (gsl_matrix_scale (eigens, 1.0 / norm_e));
596 // X factor score.
597 a68_dgemm (SELF, SELF, 1.0, E, eigens, 0.0, &t);
598 norm = matrix_norm (t);
599 ASSERT_GSL (gsl_matrix_scale (t, 1.0 / norm));
600 // Y weight.
601 a68_dgemm (FLIP, SELF, 1.0, F, t, 0.0, &c);
602 norm = matrix_norm (c);
603 ASSERT_GSL (gsl_matrix_scale (c, 1.0 / norm));
604 // Y score.
605 a68_dgemm (SELF, SELF, 1.0, F, c, 0.0, &u);
606 gsl_matrix_memcpy (diff, u);
607 gsl_matrix_sub (diff, u0);
608 norm = matrix_norm (diff);
609 if (norm < SMALL_EIGEN) {
610 j = PLS_MAX_ITER;
611 }
612 }
613 if (k > 0 && (norm_e / gsl_vector_get (Sv, 0)) < VALUE (&lim)) {
614 Nk = k;
615 siga = A68_FALSE;
616 } else {
617 gsl_vector_set (Sv, k, norm_e);
618 // X factor loading and deflation.
619 a68_dgemm (FLIP, SELF, 1.0, E, t, 0.0, &pl);
620 norm = matrix_norm (t);
621 ASSERT_GSL (gsl_matrix_scale (pl, 1.0 / (norm * norm)));
622 a68_dgemm (SELF, FLIP, -1.0, t, pl, 1.0, &E);
623 // Y factor loading and deflation.
624 a68_dgemm (FLIP, SELF, 1.0, F, u, 0.0, &ql);
625 norm = matrix_norm (u);
626 ASSERT_GSL (gsl_matrix_scale (ql, 1.0 / (norm * norm)));
627 a68_dgemm (FLIP, SELF, 1.0, t, u, 0.0, &b);
628 a68_dgemm (SELF, FLIP, -gsl_matrix_get (b, 0, 0), t, ql, 1.0, &F);
629 // Build vector and matrices.
630 gsl_vector_set (dD, k, gsl_matrix_get (b, 0, 0));
631 nP = mat_before_ab (p, nP, pl); // P
632 nC = mat_before_ab (p, nC, c); // C
633 }
634 }
635 // Compute β = (Pᵀ)⁻¹ * D * Cᵀ
636 // Python diag function.
637 gsl_matrix *D = gsl_matrix_calloc (Nk, Nk);
638 for (int k = 0; k < Nk; k++) {
639 gsl_matrix_set (D, k, k, gsl_vector_get (dD, k));
640 }
641 gsl_matrix *mpinv = NO_REAL_MATRIX;
642 compute_pseudo_inverse (p, &mpinv, nP, SMALL_EIGEN);
643 gsl_matrix *z = NO_REAL_MATRIX, *beta = NO_REAL_MATRIX;
644 a68_dgemm (FLIP, SELF, 1.0, mpinv, D, 0.0, &z);
645 a68_dgemm (SELF, FLIP, 1.0, z, nC, 0.0, &beta);
646 // Yield results.
647 if (!IS_NIL (singulars)) {
648 gsl_vector *Svl = gsl_vector_calloc (Nk);
649 for (int k = 0; k < Nk; k++) {
650 gsl_vector_set (Svl, k, gsl_vector_get (Sv, k));
651 }
652 *DEREF (A68_REF, &singulars) = vector_to_row (p, Svl);
653 a68_vector_free (Svl);
654 }
655 push_matrix (p, beta);
656 (void) gsl_set_error_handler (save_handler);
657 // Garbage collector.
658 a68_vector_free (dD);
659 a68_vector_free (Sv);
660 a68_matrix_free (b);
661 a68_matrix_free (beta);
662 a68_matrix_free (c);
663 a68_matrix_free (D);
664 a68_matrix_free (diff);
665 a68_matrix_free (eigens);
666 a68_matrix_free (nC);
667 a68_matrix_free (nP);
668 a68_matrix_free (mpinv);
669 a68_matrix_free (pl);
670 a68_matrix_free (ql);
671 a68_matrix_free (t);
672 a68_matrix_free (u);
673 a68_matrix_free (u0);
674 a68_matrix_free (z);
675 #undef PLS_MAX_ITER
676 }
677
678 //! @brief PROC tls = ([, ] REAL, [, ] REAL, REF [] REAL) [, ] REAL
679
680 void genie_matrix_tls (NODE_T *p)
681 {
682 // TLS relates to PLS as OLS relates to PCR.
683 // TLS is implemented as PLS1 with maximum number of singular values.
684 PUSH_VALUE (p, 0.0, A68_REAL);
685 PUSH_VALUE (p, 0, A68_INT);
686 genie_matrix_pls1 (p);
687 }
688
689 #endif
© 2002-2024 J.M. van der Veer (jmvdveer@xs4all.nl)
|