single-multivariate.c

     
   1  //! @file single-multivariate.c
   2  //! @author J. Marcel van der Veer
   3  
   4  //! @section Copyright
   5  //!
   6  //! This file is part of Algol68G - an Algol 68 compiler-interpreter.
   7  //! Copyright 2001-2024 J. Marcel van der Veer [algol68g@xs4all.nl].
   8  
   9  //! @section License
  10  //!
  11  //! This program is free software; you can redistribute it and/or modify it 
  12  //! under the terms of the GNU General Public License as published by the 
  13  //! Free Software Foundation; either version 3 of the License, or 
  14  //! (at your option) any later version.
  15  //!
  16  //! This program is distributed in the hope that it will be useful, but 
  17  //! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 
  18  //! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 
  19  //! more details. You should have received a copy of the GNU General Public 
  20  //! License along with this program. If not, see [http://www.gnu.org/licenses/].
  21  
  22  //! @section Synopsis
  23  //!
  24  //! REAL multivariate regression.
  25  
  26  // This code implements least square regression methods:
  27  //   OLS - Ordinary Least Squares
  28  //   PCR - Principle Component Regression, OLS after dimension reduction
  29  //   TLS - Total Least Squares
  30  //   PLS - Partial Least Squares, TLS after dimension reduction
  31  
  32  #include "a68g.h"
  33  
  34  #if defined (HAVE_GSL)
  35  
  36  #include "a68g-torrix.h"
  37  #include "a68g-prelude-gsl.h"
  38  
  39  #define SMALL_EIGEN 1e-9
  40  
  41  //! @brief column-centered data matrix.
  42  
  43  gsl_matrix *column_mean (gsl_matrix *DATA)
  44  {
  45  // A is MxN; M samples x N features.
  46    unt M = SIZE1 (DATA), N = SIZE2 (DATA);
  47    gsl_matrix *C = gsl_matrix_calloc (M, N);
  48    for (int i = 0; i < N; i++) {
  49      DOUBLE_T sum = 0;
  50      for (int j = 0; j < M; j++) {
  51        sum += gsl_matrix_get (DATA, j, i);
  52      }
  53      REAL_T mean = sum / M;
  54      for (int j = 0; j < M; j++) {
  55        gsl_matrix_set (C, j, i, gsl_matrix_get (DATA, j, i) - mean);
  56      }
  57    }
  58    return C;
  59  }
  60  
  61  //! @brief left columns from matrix.
  62  
  63  static gsl_matrix *left_columns (NODE_T *p, gsl_matrix *X, int cols)
  64  {
  65    MATH_RTE (p, cols < 0, M_INT, "invalid number of columns");
  66    unt M = SIZE1 (X), N = SIZE2 (X);
  67    if (cols == 0 || cols > N) {
  68      cols = N;
  69    }
  70    gsl_matrix *Y = gsl_matrix_calloc (M, cols);
  71    for (int i = 0; i < M; i++) {
  72      for (int j = 0; j < cols; j++) {
  73        gsl_matrix_set (Y, i, j, gsl_matrix_get (X, i, j));
  74      }
  75    }
  76    return Y;
  77  }
  78  
  79  //! @brief Compute Moore-Penrose pseudo inverse.
  80  
  81  void compute_pseudo_inverse (NODE_T *p, gsl_matrix **mpinv, gsl_matrix *X, REAL_T lim)
  82  {
  83  // The Moore-Penrose pseudo inverse gives a least-square approximate solution
  84  // for a system of linear equations not having an exact solution.
  85  // Multivariate statistics is a well known application.
  86    MATH_RTE (p, X == NO_REAL_MATRIX, M_ROW_ROW_REAL, "empty data matrix");
  87    MATH_RTE (p, lim < 0, M_REAL, "invalid limit");
  88    unt M = SIZE1 (X), N = SIZE2 (X);  
  89  // GSL only handles M >= N, transpose commutes with pseudo inverse.
  90    gsl_matrix *U;
  91    BOOL_T transpose = (M < N);
  92    if (transpose) {
  93      M = SIZE2 (X); N = SIZE1 (X);  
  94      U = gsl_matrix_calloc (M, N);
  95      gsl_matrix_transpose_memcpy (U, X);
  96    } else {
  97      U = gsl_matrix_calloc (M, N);
  98      gsl_matrix_memcpy (U, X);
  99    }
 100  // A = USVᵀ by Jacobi, more precise than Golub-Reinsch.
 101  // The GSL routine yields V, not Vᵀ. U is decomposed in place.
 102    gsl_matrix *V = gsl_matrix_calloc (N, N);
 103    gsl_vector *Sv = gsl_vector_calloc (N);
 104    ASSERT_GSL (gsl_linalg_SV_decomp_jacobi (U, V, Sv));
 105  // Compute S⁻¹. 
 106  // Very small eigenvalues wreak havoc on a pseudo inverse.
 107  // So diagonal elements < 'lim * largest element' are set to zero.
 108  // Python NumPy default is 1e-15, but assumes a Hermitian matrix.
 109  // SVD gives singular values sorted in descending order.
 110    REAL_T x0 = gsl_vector_get (Sv, 0);
 111    MATH_RTE (p, x0 == 0, M_ROW_ROW_REAL, "zero eigenvalue or singular value");
 112    gsl_matrix *S_inv = gsl_matrix_calloc (N, M);
 113    gsl_matrix_set (S_inv, 0, 0, 1 / x0);
 114    for (int i = 1; i < N; i++) {
 115      REAL_T x = gsl_vector_get (Sv, i);
 116      if ((x / x0) > lim) {
 117        gsl_matrix_set (S_inv, i, i, 1 / x);
 118      } else {
 119        gsl_matrix_set (S_inv, i, i, 0);
 120      }
 121    }
 122    a68_vector_free (Sv);
 123  // GSL SVD yields thin SVD - pad with zeros.
 124    gsl_matrix *Uf = gsl_matrix_calloc (M, M);
 125    for (int i = 0; i < M; i++) {
 126      for (int j = 0; j < N; j++) {
 127        gsl_matrix_set (Uf, i, j, gsl_matrix_get (U, i, j));
 128      }
 129    }
 130  // Compute pseudo inverse A⁻¹ = VS⁻¹Uᵀ. 
 131    gsl_matrix *VS_inv = NO_REAL_MATRIX, *X_inv = NO_REAL_MATRIX;
 132    a68_dgemm (SELF, SELF, 1, V, S_inv, 0, &VS_inv);
 133    a68_dgemm (SELF, FLIP, 1, VS_inv, Uf, 0, &X_inv);
 134  // Compose result.
 135    if (transpose) {
 136      (*mpinv) = gsl_matrix_calloc (M, N);
 137      gsl_matrix_transpose_memcpy ((*mpinv), X_inv);
 138    } else {
 139      (*mpinv) = gsl_matrix_calloc (N, M);
 140      gsl_matrix_memcpy ((*mpinv), X_inv);
 141    }
 142  // Clean up.
 143    a68_matrix_free (S_inv);
 144    a68_matrix_free (U);
 145    a68_matrix_free (Uf);
 146    a68_matrix_free (V);
 147    a68_matrix_free (VS_inv);
 148    a68_matrix_free (X_inv);
 149  }
 150  
 151  //! @brief Compute Principal Component Analysis by SVD.
 152  
 153  gsl_matrix *pca_svd (gsl_matrix *X, gsl_vector ** Sv)
 154  {
 155  // In PCA, SVD is closely related to eigenvector decomposition of the covariance matrix:
 156  // If Cov = XᵀX = VLVᵀ and X = USVᵀ then XᵀX = VSUᵀUSVᵀ = VS²Vᵀ, hence L = S².
 157  // M samples, N features.
 158    unt M = SIZE1 (X), N = SIZE2 (X);
 159  // GSL does thin SVD, only handles M >= N, overdetermined systems.    
 160    BOOL_T transpose = (M < N);
 161    gsl_matrix *U = NO_REAL_MATRIX;
 162    if (transpose) {
 163  // Xᵀ = VSUᵀ
 164      M = SIZE2 (X); N = SIZE1 (X);  
 165      U = gsl_matrix_calloc (M, N);
 166      gsl_matrix_transpose_memcpy (U, X);
 167    } else {
 168  // X = USVᵀ
 169      U = gsl_matrix_calloc (M, N);
 170      gsl_matrix_memcpy (U, X);
 171    }
 172  // X = USVᵀ by Jacobi, more precise than Golub-Reinsch.
 173  // GSL routine yields V, not Vᵀ, U develops in place.
 174    gsl_matrix *V = gsl_matrix_calloc (N, N);
 175    (*Sv) = gsl_vector_calloc (N);
 176    ASSERT_GSL (gsl_linalg_SV_decomp_jacobi (U, V, (*Sv)));
 177  // Return singular values, if required.
 178    gsl_matrix *eigens = gsl_matrix_calloc (M, N);
 179    if (transpose) {
 180      ASSERT_GSL (gsl_matrix_memcpy (eigens, U));
 181    } else {
 182      ASSERT_GSL (gsl_matrix_memcpy (eigens, V));
 183    }
 184    a68_matrix_free (U);
 185    a68_matrix_free (V);
 186    return eigens;
 187  }
 188  
 189  //! @brief PROC pseudo inv = ([, ] REAL, REAL) [, ] REAL
 190  
 191  void genie_matrix_pinv_lim (NODE_T * p)
 192  {
 193  // Compute the Moore-Penrose pseudo inverse of a MxN matrix.
 194  // G. Strang. "Linear Algebra and its applications", 2nd edition.
 195  // Academic Press [1980].
 196    gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
 197    A68_REAL lim;
 198    POP_OBJECT (p, &lim, A68_REAL);
 199    gsl_matrix *X = pop_matrix (p, A68_TRUE), *mpinv = NO_REAL_MATRIX;
 200    compute_pseudo_inverse (p, &mpinv, X, VALUE (&lim));
 201    push_matrix (p, mpinv);
 202    a68_matrix_free (mpinv);
 203    a68_matrix_free (X);
 204    (void) gsl_set_error_handler (save_handler);
 205  }
 206  
 207  //! @brief OP PINV = ([, ] REAL) [, ] REAL
 208  
 209  void genie_matrix_pinv (NODE_T * p)
 210  {
 211    PUSH_VALUE (p, SMALL_EIGEN, A68_REAL);
 212    genie_matrix_pinv_lim (p);
 213  }
 214  
 215  //! @brief OP MEAN = ([, ] REAL) [, ] REAL
 216  
 217  void genie_matrix_column_mean (NODE_T *p)
 218  {
 219    gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
 220    gsl_matrix *Z = pop_matrix (p, A68_TRUE); 
 221    unt M = SIZE1 (Z), N = SIZE2 (Z);
 222    gsl_matrix *A = gsl_matrix_calloc (M, N);
 223    for (int i = 0; i < N; i++) {
 224      DOUBLE_T sum = 0;
 225      for (int j = 0; j < M; j++) {
 226        sum += gsl_matrix_get (Z, j, i);
 227      }
 228      DOUBLE_T mean = sum / M;
 229      for (int j = 0; j < M; j++) {
 230        gsl_matrix_set (A, j, i, mean);
 231      }
 232    }
 233    push_matrix (p, A);
 234    a68_matrix_free (A);
 235    a68_matrix_free (Z);
 236    (void) gsl_set_error_handler (save_handler);
 237  }
 238  
 239  //! @brief PROC left columns = ([, ] REAL, INT) [, ] REAL
 240  
 241  void genie_left_columns (NODE_T *p)
 242  {
 243    A68_INT k;
 244    POP_OBJECT (p, &k, A68_INT);
 245    gsl_matrix *X = pop_matrix (p, A68_TRUE); 
 246    gsl_matrix *Y = left_columns (p, X, VALUE (&k));
 247    push_matrix (p, Y);
 248    a68_matrix_free (X);
 249    a68_matrix_free (Y);
 250  }
 251  
 252  //! @brief PROC pca cv = ([, ] REAL, REF [] REAL) [, ] REAL
 253  
 254  void genie_matrix_pca_cv (NODE_T * p)
 255  {
 256  // Principal component analysis of a MxN matrix from a covariance matrix.
 257  // Forming the covariance matrix squares the condition number.
 258  // Hence this routine might loose more significant digits than SVD.
 259  // On the other hand, using PCA one looks for dominant eigenvectors.
 260    gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
 261    A68_REF singulars;
 262    POP_REF (p, &singulars);
 263    gsl_matrix *X = pop_matrix (p, A68_TRUE);
 264    MATH_RTE (p, X == NO_REAL_MATRIX, M_ROW_ROW_REAL, "empty data matrix");
 265  // M samples, N features.
 266    unt M = SIZE1 (X), N = SIZE2 (X);
 267  // Keep X column-mean-centered.
 268    gsl_matrix *C = column_mean (X);
 269  // Covariance matrix, M samples: Cov = XᵀX.
 270    M = MAX (M, N);
 271    gsl_matrix *CV = NO_REAL_MATRIX;
 272    a68_dgemm (FLIP, SELF, 1, C, C, 0, &CV);
 273  // Compute and sort eigenvectors.
 274    gsl_vector *Sv = gsl_vector_calloc (M);
 275    gsl_matrix *eigens = gsl_matrix_calloc (M, M);
 276    gsl_eigen_symmv_workspace *W = gsl_eigen_symmv_alloc (M);
 277    ASSERT_GSL (gsl_eigen_symmv (CV, Sv, eigens, W));
 278    ASSERT_GSL (gsl_eigen_symmv_sort (Sv, eigens, GSL_EIGEN_SORT_ABS_DESC));
 279  // Yield results.
 280    if (!IS_NIL (singulars)) {
 281      *DEREF (A68_REF, &singulars) = vector_to_row (p, Sv);
 282    }
 283    push_matrix (p, eigens);
 284    (void) gsl_set_error_handler (save_handler);
 285  // Garbage collector
 286    a68_matrix_free (eigens);
 287    a68_matrix_free (X);
 288    gsl_eigen_symmv_free (W);
 289    a68_matrix_free (C);
 290    a68_matrix_free (CV);
 291    a68_vector_free (Sv);
 292  }
 293  
 294  //! @brief PROC pca svd = ([, ] REAL, REF [] REAL) [, ] REAL
 295  
 296  void genie_matrix_pca_svd (NODE_T * p)
 297  {
 298  // Principal component analysis of a MxN matrix by Jacobi SVD.
 299    gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
 300    A68_REF singulars;
 301    POP_REF (p, &singulars);
 302  // X data matrix, samples in rows, features in columns
 303    gsl_matrix *X = pop_matrix (p, A68_TRUE);
 304    MATH_RTE (p, X == NO_REAL_MATRIX, M_ROW_ROW_REAL, "empty data matrix");
 305  // Keep X column-mean-centered.
 306    gsl_matrix *C = column_mean (X);
 307    gsl_vector *Sv = NO_REAL_VECTOR;
 308    gsl_matrix *eigens = pca_svd (C, &Sv);
 309  // Yield results.
 310    if (!IS_NIL (singulars)) {
 311      *DEREF (A68_REF, &singulars) = vector_to_row (p, Sv);
 312    }
 313    push_matrix (p, eigens);
 314    (void) gsl_set_error_handler (save_handler);
 315  // Clean up.
 316    a68_matrix_free (eigens);
 317    a68_matrix_free (X);
 318    a68_matrix_free (C);
 319    if (Sv != NO_REAL_VECTOR) {
 320      a68_vector_free (Sv);
 321    }
 322  }
 323  
 324  //! @brief PROC pcr = ([, ] REAL, [, ] REAL, REF [] REAL, INT, REAL) [, ] REAL
 325  
 326  void genie_matrix_pcr (NODE_T * p)
 327  {
 328  // Principal Component Regression of a MxN matrix by Jacobi SVD.
 329    gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
 330  // Pop arguments.
 331  // 'lim' is minimum relative length to first eigenvector.
 332    A68_REAL lim;
 333    POP_OBJECT (p, &lim, A68_REAL); 
 334  // 'Nc' is maximum number of eigenvectors.
 335    A68_INT Nc;
 336    POP_OBJECT (p, &Nc, A68_INT); 
 337    A68_REF singulars;
 338    POP_REF (p, &singulars);
 339  // Y data vector, features in a single column
 340    gsl_matrix *Y = pop_matrix (p, A68_TRUE);
 341    MATH_RTE (p, Y == NO_REAL_MATRIX, M_ROW_ROW_REAL, "empty data matrix");
 342    unt My = SIZE1 (Y), Ny = SIZE2 (Y);
 343  // X data matrix, samples in rows, features in columns
 344    gsl_matrix *X = pop_matrix (p, A68_TRUE);
 345    MATH_RTE (p, X == NO_REAL_MATRIX, M_ROW_ROW_REAL, "empty data matrix");
 346    unt Mx = SIZE1 (X), Nx = SIZE2 (X);
 347  // Catch wrong calls.
 348    MATH_RTE (p, My == 0 || Ny == 0, M_ROW_ROW_REAL, "invalid column vector F");
 349    MATH_RTE (p, Mx == 0 || Nx == 0, M_ROW_ROW_REAL, "invalid data matrix E");
 350    MATH_RTE (p, Mx != My, M_ROW_ROW_REAL, "rows in F must match columns in E");
 351    MATH_RTE (p, Ny != 1, M_ROW_ROW_REAL, "F must be a column vector");
 352    MATH_RTE (p, VALUE (&lim) < 0 || VALUE (&lim) > 1.0, M_REAL, "invalid relative length");
 353  // Keep X column-mean-centered.
 354    gsl_matrix *C = column_mean (X);
 355    gsl_vector *Sv = NO_REAL_VECTOR;
 356    gsl_matrix *eigens = pca_svd (C, &Sv);
 357  // Dimension reduction.
 358    int Nk = (VALUE (&Nc) == 0 ? SIZE (Sv) : MIN (SIZE (Sv), VALUE (&Nc)));
 359    if (VALUE (&lim) <= 0) {
 360      VALUE (&lim) = SMALL_EIGEN;
 361    }
 362  // Too short eigenvectors wreak havoc.
 363    REAL_T Sv0 = gsl_vector_get (Sv, 0);
 364    for (int k = 1; k < SIZE (Sv); k++) {
 365      if (gsl_vector_get (Sv, k) / Sv0 < VALUE (&lim)) {
 366        if (k + 1 < Nk) {
 367          Nk = k + 1;
 368        }
 369      }
 370    }
 371    gsl_matrix *reduced = left_columns (p, eigens, Nk);
 372  // Compute projected set = X * reduced.
 373    gsl_matrix *proj = NO_REAL_MATRIX;
 374    a68_dgemm (SELF, SELF, 1.0, X, reduced, 0.0, &proj);
 375  // Compute β = reduced * P⁻¹ * Y.
 376    gsl_matrix *mpinv = NO_REAL_MATRIX;
 377    compute_pseudo_inverse (p, &mpinv, proj, SMALL_EIGEN);
 378    gsl_matrix *z = NO_REAL_MATRIX, *beta = NO_REAL_MATRIX;
 379    a68_dgemm (SELF, SELF, 1.0, reduced, mpinv, 0.0, &z);
 380    a68_dgemm (SELF, SELF, 1.0, z, Y, 0.0, &beta);
 381  // Yield results.
 382    if (!IS_NIL (singulars)) {
 383      *DEREF (A68_REF, &singulars) = vector_to_row (p, Sv);
 384    }
 385    push_matrix (p, beta);
 386    (void) gsl_set_error_handler (save_handler);
 387  // Clean up.
 388    a68_matrix_free (eigens);
 389    a68_matrix_free (reduced);
 390    a68_matrix_free (X);
 391    a68_matrix_free (Y);
 392    a68_matrix_free (C);
 393    a68_vector_free (Sv);
 394    a68_matrix_free (z);
 395    a68_matrix_free (mpinv);
 396    a68_matrix_free (proj);
 397    a68_matrix_free (beta);
 398  }
 399  
 400  //! @brief PROC ols = ([, ] REAL, [, ] REAL) [, ] REAL
 401  
 402  void genie_matrix_ols (NODE_T *p)
 403  {
 404  // TLS relates to PLS as OLS relates to PCR.
 405  // Note that X is an MxN matrix and Y, β are Nx1 column vectors.
 406  // OLS can implemented (inefficiently) as PCR with maximum number of singular values:
 407  //  PUSH_VALUE (p, 0.0, A68_REAL);
 408  //  PUSH_VALUE (p, 0, A68_INT);
 409  //  genie_matrix_pcr (p);
 410    gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
 411  // X and Y matrices.
 412    gsl_matrix *Y = pop_matrix (p, A68_TRUE);
 413    gsl_matrix *X = pop_matrix (p, A68_TRUE);
 414  // Compute β = X⁻¹ * Y.
 415    gsl_matrix *mpinv = NO_REAL_MATRIX, *beta = NO_REAL_MATRIX;
 416    compute_pseudo_inverse (p, &mpinv, X, SMALL_EIGEN);
 417    a68_dgemm (SELF, SELF, 1.0, X, Y, 0.0, &beta);
 418  // Yield results.
 419    push_matrix (p, beta);
 420    (void) gsl_set_error_handler (save_handler);
 421    a68_matrix_free (beta);
 422    a68_matrix_free (mpinv);
 423    a68_matrix_free (X);
 424    a68_matrix_free (Y);
 425  }
 426  
 427  //! @brief PROC pls1 = ([, ] REAL, [, ] REAL, REF [] REAL, INT, REAL) [, ] REAL
 428  
 429  void genie_matrix_pls1 (NODE_T *p)
 430  {
 431  // PLS decomposes X and Y data concurrently as
 432  // X = T Pᵀ + E
 433  // Y = U Qᵀ + F
 434  // PLS1 is a widely used algorithm appropriate for the vector Y case.
 435  // This procedure computes PLS1 with SVD solver for β,
 436  // by a NIPALS algorithm with orthonormal scores and loadings.
 437  // See Ulf Indahl, Journal of Chemometrics 28(3) 168-180 [2014].
 438  // Note that E is an MxN matrix and F, β are Nx1 column vectors.
 439  // This for consistency with PLS2.
 440    gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
 441  // Pop arguments.
 442  // 'lim' is minimum relative length to first eigenvector.
 443    A68_REAL lim;
 444    POP_OBJECT (p, &lim, A68_REAL); 
 445  // 'Nc' is maximum number of eigenvectors.
 446    A68_INT Nc;
 447    POP_OBJECT (p, &Nc, A68_INT); 
 448    A68_REF singulars;
 449    POP_REF (p, &singulars);
 450  // X and Y matrices.
 451    gsl_matrix *F = pop_matrix (p, A68_TRUE);
 452    gsl_matrix *E = pop_matrix (p, A68_TRUE);
 453  // Catch wrong calls.
 454    unt Me = SIZE1 (E), Ne = SIZE2 (E), Mf = SIZE1 (F), Nf = SIZE2 (F);
 455    MATH_RTE (p, Mf == 0 || Nf == 0, M_ROW_ROW_REAL, "invalid column vector F");
 456    MATH_RTE (p, Me == 0 || Ne == 0, M_ROW_ROW_REAL, "invalid data matrix E");
 457    MATH_RTE (p, Me != Mf, M_ROW_ROW_REAL, "rows in F must match columns in E");
 458    MATH_RTE (p, Nf != 1, M_ROW_ROW_REAL, "F must be a column vector");
 459    MATH_RTE (p, VALUE (&lim) < 0 || VALUE (&lim) > 1.0, M_REAL, "invalid relative length");
 460  // Sensible defaults.
 461    int Nk = (VALUE (&Nc) == 0 ? MIN (Ne, Mf) : MIN (Mf, VALUE (&Nc)));
 462    if (VALUE (&lim) <= 0) {
 463      VALUE (&lim) = SMALL_EIGEN;
 464    }
 465  // Decompose E and F.
 466    gsl_matrix *EIGEN = NO_REAL_MATRIX, *nE = NO_REAL_MATRIX, *nF = NO_REAL_MATRIX; 
 467    gsl_vector *Sv = gsl_vector_calloc (Nk);
 468    BOOL_T siga = A68_TRUE;
 469    gsl_matrix *eigens = NO_REAL_MATRIX, *lat = NO_REAL_MATRIX, *pl = NO_REAL_MATRIX, *ql = NO_REAL_MATRIX;
 470    for (int k = 0; k < Nk && siga; k ++) {
 471  // E weight from E, F covariance.
 472  // eigens = Eᵀ * f / | Eᵀ * f |
 473      a68_dgemm (FLIP, SELF, 1.0, E, F, 0.0, &eigens);
 474      REAL_T norm = matrix_norm (eigens);
 475      if (k > 0 && (norm / gsl_vector_get (Sv, 0)) < VALUE (&lim)) {
 476        Nk = k;
 477        siga = A68_FALSE;
 478      } else {
 479        ASSERT_GSL (gsl_matrix_scale (eigens, 1.0 / norm));
 480        gsl_vector_set (Sv, k, norm);
 481  // Compute latent variable.
 482  // lat = E * eigens / | E * eigens |
 483        a68_dgemm (SELF, SELF, 1.0, E, eigens, 0.0, &lat);
 484        norm = matrix_norm (lat);
 485        ASSERT_GSL (gsl_matrix_scale (lat, 1.0 / norm));
 486  // Deflate E and F, remove latent variable from both.
 487  // pl = Eᵀ * lat; E -= lat * plᵀ and ql = Fᵀ * lat; F -= lat * qlᵀ
 488        a68_dgemm (FLIP, SELF, 1.0, E, lat, 0.0, &pl);
 489        a68_dgemm (SELF, FLIP, -1.0, lat, pl, 1.0, &E);
 490        a68_dgemm (FLIP, SELF, 1.0, F, lat, 0.0, &ql);
 491        a68_dgemm (SELF, FLIP, -1.0, lat, ql, 1.0, &F);
 492  // Build matrices.
 493        EIGEN = mat_before_ab (p, EIGEN, eigens);
 494        nE = mat_before_ab (p, nE, pl); // P
 495        nF = mat_over_ab (p, nF, ql);   // Qᵀ
 496      }
 497    }
 498  // Projection of original data = Eᵀ * EIGEN
 499    gsl_matrix *nP = NO_REAL_MATRIX;
 500    a68_dgemm (FLIP, SELF, 1.0, nE, EIGEN, 0.0, &nP);
 501  // Compute β = EIGEN * nP⁻¹ * nF
 502    gsl_matrix *mpinv = NO_REAL_MATRIX, *z = NO_REAL_MATRIX, *beta = NO_REAL_MATRIX;
 503    compute_pseudo_inverse (p, &mpinv, nP, SMALL_EIGEN);
 504    a68_dgemm (SELF, SELF, 1.0, EIGEN, mpinv, 0.0, &z);
 505    a68_dgemm (SELF, SELF, 1.0, z, nF, 0.0, &beta);
 506  // Yield results.
 507    if (!IS_NIL (singulars)) {
 508      gsl_vector *Svl = gsl_vector_calloc (Nk);
 509      for (int k = 0; k < Nk; k++) {
 510        gsl_vector_set (Svl, k, gsl_vector_get (Sv, k));
 511      }
 512      *DEREF (A68_REF, &singulars) = vector_to_row (p, Svl);
 513      a68_vector_free (Svl);
 514    }
 515    push_matrix (p, beta);
 516    (void) gsl_set_error_handler (save_handler);
 517  // Garbage collector.
 518    a68_matrix_free (E);
 519    a68_matrix_free (F);
 520    a68_matrix_free (eigens);
 521    a68_matrix_free (lat);
 522    a68_matrix_free (pl);
 523    a68_matrix_free (ql);
 524    a68_matrix_free (beta);
 525    a68_matrix_free (EIGEN);
 526    a68_matrix_free (nE);
 527    a68_matrix_free (nF);
 528    a68_matrix_free (nP);
 529    a68_vector_free (Sv);
 530  }
 531  
 532  //! @brief PROC pls2 = ([, ] REAL, [, ] REAL, REF [] REAL, INT, REAL) [, ] REAL
 533  
 534  void genie_matrix_pls2 (NODE_T *p)
 535  {
 536  // PLS decomposes X and Y data concurrently as
 537  // X = T Pᵀ + E
 538  // Y = U Qᵀ + F
 539  // Generic orthodox NIPALS, following Hervé Abdi, University of Texas.
 540  // "Partial Least Squares Regression and Projection on Latent Structure Regression",
 541  // Computational Statistics [2010].
 542  #define PLS_MAX_ITER 100
 543    gsl_error_handler_t *save_handler = gsl_set_error_handler (torrix_error_handler);
 544  // Pop arguments.
 545  // 'lim' is minimum relative length to first eigenvector.
 546    A68_REAL lim;
 547    POP_OBJECT (p, &lim, A68_REAL); 
 548  // 'Nc' is maximum number of eigenvectors.
 549    A68_INT Nc;
 550    POP_OBJECT (p, &Nc, A68_INT); 
 551    A68_REF singulars;
 552    POP_REF (p, &singulars);
 553  // X and Y matrices.
 554    gsl_matrix *F = pop_matrix (p, A68_TRUE);
 555    gsl_matrix *E = pop_matrix (p, A68_TRUE);
 556  // Catch wrong calls.
 557    unt Me = SIZE1 (E), Ne = SIZE2 (E), Mf = SIZE1 (F), Nf = SIZE2 (F);
 558    MATH_RTE (p, Mf == 0 || Nf == 0, M_ROW_ROW_REAL, "invalid column vector F");
 559    MATH_RTE (p, Me == 0 || Ne == 0, M_ROW_ROW_REAL, "invalid data matrix E");
 560    MATH_RTE (p, Me != Mf, M_ROW_ROW_REAL, "rows in F must match columns in E");
 561    CHECK_INT_SHORTEN (p, VALUE (&Nc));
 562  // Sensible defaults.
 563    int Nk = (VALUE (&Nc) == 0 ? MIN (Ne, Mf) : MIN (Mf, VALUE (&Nc)));
 564    if (VALUE (&lim) <= 0) {
 565      VALUE (&lim) = SMALL_EIGEN;
 566    }
 567  // Initial vector u.
 568    gsl_matrix *u = gsl_matrix_calloc (Mf, 1);
 569    if (Nf == 1) { // Column vector, PLS1
 570      for (int k = 0; k < Mf; k ++) {
 571        gsl_matrix_set (u, k, 0, gsl_matrix_get (F, k, 0));
 572      }
 573    } else {
 574      for (int k = 0; k < Mf; k ++) {
 575        gsl_matrix_set (u, k, 0, a68_gauss_rand ());
 576      }
 577    }
 578  // Decompose E and F jointly.
 579    gsl_vector *Sv = gsl_vector_calloc (Nk), *dD = gsl_vector_calloc (Nk);
 580    gsl_matrix *u0 = gsl_matrix_calloc (Mf, 1), *diff = gsl_matrix_calloc (Mf, 1);
 581    gsl_matrix *eigens = NO_REAL_MATRIX;
 582    gsl_matrix *t = NO_REAL_MATRIX, *b = NO_REAL_MATRIX, *c = NO_REAL_MATRIX;
 583    gsl_matrix *pl = NO_REAL_MATRIX, *ql = NO_REAL_MATRIX, *nP = NO_REAL_MATRIX, *nC = NO_REAL_MATRIX;
 584    BOOL_T siga = A68_TRUE;
 585    for (int k = 0; k < Nk && siga; k ++) {
 586      REAL_T norm_e, norm;
 587      for (int j = 0; j < PLS_MAX_ITER; j ++) {
 588        gsl_matrix_memcpy (u0, u);
 589  // Compute X weight.  Note that Eᵀ * u is of form [Indahl]
 590  // Eᵀ * F * Fᵀ * E * w / |Eᵀ * F * Fᵀ * w| = (1 / lambda) (Fᵀ * E)ᵀ * (Fᵀ * E) * w,
 591  // that is, w is an eigenvector of covariance matrix Fᵀ * E and 
 592  // longest eigenvector of symmetric matrix Eᵀ * F * Fᵀ * E.
 593        a68_dgemm (FLIP, SELF, 1.0, E, u, 0.0, &eigens);
 594        norm_e = matrix_norm (eigens);
 595        ASSERT_GSL (gsl_matrix_scale (eigens, 1.0 / norm_e));
 596  // X factor score.
 597        a68_dgemm (SELF, SELF, 1.0, E, eigens, 0.0, &t);
 598        norm = matrix_norm (t);
 599        ASSERT_GSL (gsl_matrix_scale (t, 1.0 / norm));
 600  // Y weight.
 601        a68_dgemm (FLIP, SELF, 1.0, F, t, 0.0, &c);
 602        norm = matrix_norm (c);
 603        ASSERT_GSL (gsl_matrix_scale (c, 1.0 / norm));
 604  // Y score.
 605        a68_dgemm (SELF, SELF, 1.0, F, c, 0.0, &u);
 606        gsl_matrix_memcpy (diff, u);
 607        gsl_matrix_sub (diff, u0);
 608        norm = matrix_norm (diff);
 609        if (norm < SMALL_EIGEN) {
 610          j = PLS_MAX_ITER;
 611        }   
 612      }
 613      if (k > 0 && (norm_e / gsl_vector_get (Sv, 0)) < VALUE (&lim)) {
 614        Nk = k;
 615        siga = A68_FALSE;
 616      } else {
 617        gsl_vector_set (Sv, k, norm_e);
 618  // X factor loading and deflation.
 619        a68_dgemm (FLIP, SELF, 1.0, E, t, 0.0, &pl);
 620        norm = matrix_norm (t);
 621        ASSERT_GSL (gsl_matrix_scale (pl, 1.0 / (norm * norm)));
 622        a68_dgemm (SELF, FLIP, -1.0, t, pl, 1.0, &E);
 623  // Y factor loading and deflation.
 624        a68_dgemm (FLIP, SELF, 1.0, F, u, 0.0, &ql);
 625        norm = matrix_norm (u);
 626        ASSERT_GSL (gsl_matrix_scale (ql, 1.0 / (norm * norm)));
 627        a68_dgemm (FLIP, SELF, 1.0, t, u, 0.0, &b);
 628        a68_dgemm (SELF, FLIP, -gsl_matrix_get (b, 0, 0), t, ql, 1.0, &F);
 629  // Build vector and matrices.
 630        gsl_vector_set (dD, k, gsl_matrix_get (b, 0, 0));
 631        nP = mat_before_ab (p, nP, pl); // P
 632        nC = mat_before_ab (p, nC, c);  // C
 633      }
 634    }
 635  // Compute β = (Pᵀ)⁻¹ * D * Cᵀ
 636  // Python diag function.
 637    gsl_matrix *D = gsl_matrix_calloc (Nk, Nk);
 638    for (int k = 0; k < Nk; k++) {
 639      gsl_matrix_set (D, k, k, gsl_vector_get (dD, k));
 640    }
 641    gsl_matrix *mpinv = NO_REAL_MATRIX;
 642    compute_pseudo_inverse (p, &mpinv, nP, SMALL_EIGEN);
 643    gsl_matrix *z = NO_REAL_MATRIX, *beta = NO_REAL_MATRIX;
 644    a68_dgemm (FLIP, SELF, 1.0, mpinv, D, 0.0, &z);
 645    a68_dgemm (SELF, FLIP, 1.0, z, nC, 0.0, &beta);
 646  // Yield results.
 647    if (!IS_NIL (singulars)) {
 648      gsl_vector *Svl = gsl_vector_calloc (Nk);
 649      for (int k = 0; k < Nk; k++) {
 650        gsl_vector_set (Svl, k, gsl_vector_get (Sv, k));
 651      }
 652      *DEREF (A68_REF, &singulars) = vector_to_row (p, Svl);
 653      a68_vector_free (Svl);
 654    }
 655    push_matrix (p, beta);
 656    (void) gsl_set_error_handler (save_handler);
 657  // Garbage collector.
 658    a68_vector_free (dD);
 659    a68_vector_free (Sv);
 660    a68_matrix_free (b);
 661    a68_matrix_free (beta);
 662    a68_matrix_free (c);
 663    a68_matrix_free (D);
 664    a68_matrix_free (diff);
 665    a68_matrix_free (eigens);
 666    a68_matrix_free (nC);
 667    a68_matrix_free (nP);
 668    a68_matrix_free (mpinv);
 669    a68_matrix_free (pl);
 670    a68_matrix_free (ql);
 671    a68_matrix_free (t);
 672    a68_matrix_free (u);
 673    a68_matrix_free (u0);
 674    a68_matrix_free (z);
 675  #undef PLS_MAX_ITER
 676  }
 677  
 678  //! @brief PROC tls = ([, ] REAL, [, ] REAL, REF [] REAL) [, ] REAL
 679  
 680  void genie_matrix_tls (NODE_T *p)
 681  {
 682  // TLS relates to PLS as OLS relates to PCR.
 683  // TLS is implemented as PLS1 with maximum number of singular values.
 684    PUSH_VALUE (p, 0.0, A68_REAL);
 685    PUSH_VALUE (p, 0, A68_INT);
 686    genie_matrix_pls1 (p);
 687  }
 688  
 689  #endif
     


© 2002-2024 J.M. van der Veer (jmvdveer@xs4all.nl)