From b1ef1121af924a7646cc30721b2329549b21a6e4 Mon Sep 17 00:00:00 2001
From: Giovanni La Mura <giovanni.lamura@inaf.it>
Date: Wed, 6 Mar 2024 16:44:49 +0100
Subject: [PATCH] Implement optimization fall-back for algebraic functions

---
 src/cluster/cluster.cpp      | 20 +++++++++++++++----
 src/include/algebraic.h      | 30 ++++++++++++++++++++++++++++
 src/include/clu_subs.h       |  6 +++---
 src/libnptm/Makefile         |  6 +++---
 src/libnptm/algebraic.cpp    | 30 ++++++++++++++++++++++++++++
 src/libnptm/clu_subs.cpp     | 38 ++++++++++++++++++------------------
 src/libnptm/lapack_calls.cpp |  4 ++--
 7 files changed, 103 insertions(+), 31 deletions(-)
 create mode 100644 src/include/algebraic.h
 create mode 100644 src/libnptm/algebraic.cpp

diff --git a/src/cluster/cluster.cpp b/src/cluster/cluster.cpp
index a31b2b71..a0fc6642 100644
--- a/src/cluster/cluster.cpp
+++ b/src/cluster/cluster.cpp
@@ -37,6 +37,14 @@
 #include "../include/lapack_calls.h"
 #endif
 
+#ifndef INCLUDE_ALGEBRAIC_H_
+#include "../include/algebraic.h"
+#endif
+
+#ifdef LAPACK_ILP64
+#define USE_LAPACK
+#endif
+
 using namespace std;
 
 /*! \brief C++ implementation of CLU
@@ -71,7 +79,7 @@ void cluster(string config_file, string data_file, string output_path) {
   if (sconf->number_of_spheres == gconf->number_of_spheres) {
     // Shortcuts to variables stored in configuration objects
     int nsph = gconf->number_of_spheres;
-    int mxndm = gconf->mxndm;
+    lapack_int mxndm = (lapack_int)gconf->mxndm;
     int inpol = gconf->in_pol;
     int npnt = gconf->npnt;
     int npntts = gconf->npntts;
@@ -273,6 +281,11 @@ void cluster(string config_file, string data_file, string output_path) {
     string tppoan_name = output_path + "/c_TPPOAN";
     tppoan.open(tppoan_name.c_str(), ios::out | ios::binary);
     if (tppoan.is_open()) {
+#ifdef USE_LAPACK
+      printf("INFO: should use LAPACK calls.\n");
+#else
+      printf("INFO: should use fall-back lucin() calls.\n");
+#endif
       tppoan.write(reinterpret_cast<char *>(&iavm), sizeof(int));
       tppoan.write(reinterpret_cast<char *>(&isam), sizeof(int));
       tppoan.write(reinterpret_cast<char *>(&inpol), sizeof(int));
@@ -340,10 +353,9 @@ void cluster(string config_file, string data_file, string output_path) {
 	  if (jer != 0) break;
 	} // i132 loop
 	cms(am, c1, c1ao, c4, c6);
-	//cms(tam, c1, c1ao, c4, c6);
 	lapack_int ndit = 2 * nsph * c4->nlim;
-	//lucin(am, mxndm, ndit, jer);
-	zinvert(am, ndit, jer);
+	invert_matrix(am, ndit, jer, mxndm);
+	//zinvert(am, ndit, jer);
 	if (jer != 0) break; // jxi488 loop: goes to memory clean
 	ztm(am, c1, c1ao, c4, c6, c9);
 	if (sconf->idfc >= 0) {
diff --git a/src/include/algebraic.h b/src/include/algebraic.h
new file mode 100644
index 00000000..8c0cf124
--- /dev/null
+++ b/src/include/algebraic.h
@@ -0,0 +1,30 @@
+/*! \file algebraic.h
+ *
+ * \brief Declaration of algebraic functions with different call-backs.
+ *
+ * In principle, the system that runs NP_TMcode may offer various types of
+ * optimized features, such as multi-core or multi-node scaling, GPU offload,
+ * or external libraries. This header collects a set of functions that can
+ * perform standard algebraic operations choosing the most optimized available
+ * system as a call-back. If no optimization is detected, eventually the 
+ * legacy serial function implementation is used as a fall-back.
+ */
+
+#ifndef lapack_int
+#define lapack_int int64_t
+#endif
+
+#ifndef INCLUDE_ALGEBRAIC_H_
+#define INCLUDE_ALGEBRAIC_H_
+
+/*! \brief Perform in-place matrix inversion.
+ *
+ * \param mat: Matrix of complex. The matrix to be inverted (must be a square matrix).
+ * \param size: `lapack_int` The size of the matrix (i.e. the number of its rows or columns).
+ * \param ier: `int &` Reference to an integer variable for returning a result flag.
+ * \param max_size: `lapack_int` The maximum expected size (required by some call-backs,
+ * optional, defaults to 0).
+ */
+void invert_matrix(std::complex<double> **mat, lapack_int size, int &ier, lapack_int max_size=0);
+
+#endif
diff --git a/src/include/clu_subs.h b/src/include/clu_subs.h
index 49b89e20..6420e619 100644
--- a/src/include/clu_subs.h
+++ b/src/include/clu_subs.h
@@ -154,11 +154,11 @@ void hjv(
  * LU decomposition. See Eq. (5.29) in Borghese, Denti & Saija (2007).
  *
  * \param am: Matrix of complex.
- * \param nddmst: `const int`
- * \param n: `int`
+ * \param nddmst: `const int64_t`
+ * \param n: `int64_t`
  * \param ier: `int &`
  */
-void lucin(std::complex<double> **am, const int nddmst, int n, int &ier);
+void lucin(std::complex<double> **am, const int64_t nddmst, int64_t n, int &ier);
 
 /*! \brief Compute the average extinction cross-section.
  *
diff --git a/src/libnptm/Makefile b/src/libnptm/Makefile
index 7fedc1fb..61e447c4 100644
--- a/src/libnptm/Makefile
+++ b/src/libnptm/Makefile
@@ -19,11 +19,11 @@ endif
 include ../make.inc
 
 
-CXX_NPTM_OBJS=$(OBJDIR)/Commons.o $(OBJDIR)/Configuration.o $(OBJDIR)/file_io.o $(OBJDIR)/Parsers.o $(OBJDIR)/sph_subs.o $(OBJDIR)/clu_subs.o $(OBJDIR)/tfrfme.o $(OBJDIR)/tra_subs.o $(OBJDIR)/TransitionMatrix.o $(OBJDIR)/lapack_calls.o
+CXX_NPTM_OBJS=$(OBJDIR)/Commons.o $(OBJDIR)/Configuration.o $(OBJDIR)/file_io.o $(OBJDIR)/Parsers.o $(OBJDIR)/sph_subs.o $(OBJDIR)/clu_subs.o $(OBJDIR)/tfrfme.o $(OBJDIR)/tra_subs.o $(OBJDIR)/TransitionMatrix.o $(OBJDIR)/lapack_calls.o $(OBJDIR)/algebraic.o
 
-CXX_NPTM_DYNOBJS=$(DYNOBJDIR)/Commons.o $(DYNOBJDIR)/Configuration.o $(DYNOBJDIR)/file_io.o $(DYNOBJDIR)/Parsers.o $(DYNOBJDIR)/sph_subs.o $(DYNOBJDIR)/clu_subs.o $(DYNOBJDIR)/tfrfme.o $(DYNOBJDIR)/tra_subs.o $(DYNOBJDIR)/TransitionMatrix.o $(DYNOBJDIR)/lapack_calls.o
+CXX_NPTM_DYNOBJS=$(DYNOBJDIR)/Commons.o $(DYNOBJDIR)/Configuration.o $(DYNOBJDIR)/file_io.o $(DYNOBJDIR)/Parsers.o $(DYNOBJDIR)/sph_subs.o $(DYNOBJDIR)/clu_subs.o $(DYNOBJDIR)/tfrfme.o $(DYNOBJDIR)/tra_subs.o $(DYNOBJDIR)/TransitionMatrix.o $(DYNOBJDIR)/lapack_calls.o $(DYNOBJDIR)/algebraic.o
 
-CXX_NPTM_DEBUG=$(OBJDIR)/Commons.g* $(OBJDIR)/Configuration.g* $(OBJDIR)/file_io.g* $(OBJDIR)/Parsers.g* $(OBJDIR)/sph_subs.g* $(OBJDIR)/clu_subs.g* $(OBJDIR)/tfrfme.g* $(OBJDIR)/tra_subs.g* $(OBJDIR)/TransitionMatrix.g* $(OBJDIR)/lapack_calls.g*
+CXX_NPTM_DEBUG=$(OBJDIR)/Commons.g* $(OBJDIR)/Configuration.g* $(OBJDIR)/file_io.g* $(OBJDIR)/Parsers.g* $(OBJDIR)/sph_subs.g* $(OBJDIR)/clu_subs.g* $(OBJDIR)/tfrfme.g* $(OBJDIR)/tra_subs.g* $(OBJDIR)/TransitionMatrix.g* $(OBJDIR)/lapack_calls.g* $(OBJDIR)/algebraic.g*
 
 all: $(BUILDDIR_NPTM)/libnptm.a $(BUILDDIR_NPTM)/libnptm.so
 
diff --git a/src/libnptm/algebraic.cpp b/src/libnptm/algebraic.cpp
new file mode 100644
index 00000000..a1c42031
--- /dev/null
+++ b/src/libnptm/algebraic.cpp
@@ -0,0 +1,30 @@
+/*! \file algebraic.cpp
+ *
+ * \brief Implementation of algebraic functions with different call-backs.
+ */
+
+#include <complex>
+
+#ifndef INCLUDE_ALGEBRAIC_H_
+#include "../include/algebraic.h"
+#endif
+
+#ifndef INCLUDE_LAPACK_CALLS_H_
+#include "lapacke.h"
+#include "../include/lapack_calls.h"
+#endif
+
+// >>> FALL-BACK FUNCTIONS DECLARATION <<< //
+extern void lucin(std::complex<double> **mat, int64_t max_size, int64_t size, int &ier);
+// >>>   END OF FALL-BACK FUNCTIONS    <<< //
+
+using namespace std;
+
+void invert_matrix(std::complex<double> **mat, lapack_int size, int &ier, lapack_int max_size) {
+  ier = 0;
+#ifdef USE_LAPACK
+  zinvert(mat, size, ier);
+#else
+  lucin(mat, (int64_t)max_size, (int64_t)size, ier);
+#endif
+}
diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp
index 22287bb7..48f087b9 100644
--- a/src/libnptm/clu_subs.cpp
+++ b/src/libnptm/clu_subs.cpp
@@ -892,7 +892,7 @@ void hjv(
   delete[] rfn;
 }
 
-void lucin(std::complex<double> **am, const int nddmst, int n, int &ier) {
+void lucin(std::complex<double> **am, const int64_t nddmst, int64_t n, int &ier) {
   /* NDDMST  FIRST DIMENSION OF AM AS DECLARED IN DIMENSION
    *         STATEMENT.
    * N       NUMBER OF ROWS IN AM.
@@ -903,9 +903,9 @@ void lucin(std::complex<double> **am, const int nddmst, int n, int &ier) {
   complex<double> cc0 = complex<double>(0.0, 0.0);
   ier = 0;
   int nminus = n - 1;
-  for (int i = 1; i <= n; i++) {
+  for (int64_t i = 1; i <= n; i++) {
     double sum = 0.0;
-    for (int j = 1; j <= n; j++) {
+    for (int64_t j = 1; j <= n; j++) {
       sum += (
 	      am[i - 1][j - 1].real() * am[i - 1][j - 1].real()
 	      + am[i - 1][j - 1].imag() * am[i - 1][j - 1].imag()
@@ -918,12 +918,12 @@ void lucin(std::complex<double> **am, const int nddmst, int n, int &ier) {
   //    (ROW INTERCHANGES TAKE PLACE, AND THE INDICES OF THE PIVOTAL ROWS
   //    ARE PLACED IN V.)
   /* >>> THERE APPEARS TO BE A BUG IN THE FOLLOWING LOOP <<< */
-  for (int k = 1; k <= n; k++) {
-    int kplus = k + 1;
-    int kminus = k - 1;
-    int l = k;
+  for (int64_t k = 1; k <= n; k++) {
+    int64_t kplus = k + 1;
+    int64_t kminus = k - 1;
+    int64_t l = k;
     double psqmax = 0.0;
-    for (int i = k; i <= n; i++) {
+    for (int64_t i = k; i <= n; i++) {
       cfun = cdtp(-am[i - 1][k - 1], am, i, 1, k, kminus);
       ctemp = -cfun;
       am[i - 1][k - 1] = ctemp;
@@ -934,7 +934,7 @@ void lucin(std::complex<double> **am, const int nddmst, int n, int &ier) {
       }
     } // i2029 loop
     if (l != k) {
-      for (int j = 1; j <= n; j++) {
+      for (int64_t j = 1; j <= n; j++) {
 	ctemp = am[k - 1][j - 1];
 	am[k - 1][j - 1] = am[l - 1][j - 1];
 	am[l - 1][j - 1] = ctemp;
@@ -951,7 +951,7 @@ void lucin(std::complex<double> **am, const int nddmst, int n, int &ier) {
     ctemp = 1.0 / am[k - 1][k - 1];
     am[k - 1][k - 1] = ctemp;
     if (kplus <= n) {
-      for (int j = kplus; j <= n; j++) {
+      for (int64_t j = kplus; j <= n; j++) {
 	cfun = cdtp(-am[k - 1][j - 1], am, k, 1, j, kminus);
 	am[k - 1][j - 1] = -ctemp * cfun;
       } // j2059 loop
@@ -959,9 +959,9 @@ void lucin(std::complex<double> **am, const int nddmst, int n, int &ier) {
   } // k2019 loop
   // 4.  REPLACE AM BY ITS INVERSE AMINV.
   // 4.1 REPLACE L AND U BY THEIR INVERSE LINV AND UINV.
-  for (int k = 1; k <= nminus; k++) {
-    int kplus = k + 1;
-    for (int i = kplus; i <= n; i++) {
+  for (int64_t k = 1; k <= nminus; k++) {
+    int64_t kplus = k + 1;
+    for (int64_t i = kplus; i <= n; i++) {
       cfun = cdtp(cc0, am, i, k, k, i - k);
       am[i - 1][k - 1] = -am[i - 1][i - 1] * cfun;
       cfun = cdtp(am[k - 1][i - 1], am, k, kplus, i, i - k - 1);
@@ -969,8 +969,8 @@ void lucin(std::complex<double> **am, const int nddmst, int n, int &ier) {
     } // i4119 loop
   } // k4109 loop
   // 4.2 FORM AMINV=UINV*LINV.
-  for (int k = 1; k <= n; k++) {
-    for (int i = 1; i <= n; i++) {
+  for (int64_t k = 1; k <= n; k++) {
+    for (int64_t i = 1; i <= n; i++) {
       if (i < k) {
 	cfun = cdtp(cc0, am, i, k, k, n - k + 1);
 	am[i - 1][k -1] = cfun;
@@ -983,11 +983,11 @@ void lucin(std::complex<double> **am, const int nddmst, int n, int &ier) {
   } // k4209 loop
   // 4.3 INTERCHANGE COLUMNS OF AMINV AS SPECIFIED BY V, BUT IN REVERSE
   //     ORDER.
-  for (int l = 1; l <= n; l++) {
-    int k = n - l + 1;
-    int kcol = (int)(v[k - 1]);
+  for (int64_t l = 1; l <= n; l++) {
+    int64_t k = n - l + 1;
+    int64_t kcol = (int64_t)(v[k - 1]);
     if (kcol != k) {
-      for (int i = 1; i <= n; i++) {
+      for (int64_t i = 1; i <= n; i++) {
 	ctemp = am[i - 1][k - 1];
 	am[i - 1][k - 1] = am[i - 1][kcol - 1];
 	am[i - 1][kcol - 1] = ctemp;
diff --git a/src/libnptm/lapack_calls.cpp b/src/libnptm/lapack_calls.cpp
index 0b43c046..462bdaec 100644
--- a/src/libnptm/lapack_calls.cpp
+++ b/src/libnptm/lapack_calls.cpp
@@ -27,6 +27,6 @@ void zinvert(std::complex<double> **mat, lapack_int n, int &jer) {
       mat[j][i] = std::complex<double>(creal(arr[idx]), cimag(arr[idx]));
     }
   }
-  delete [] IPIV;
-  delete [] arr;
+  delete[] IPIV;
+  delete[] arr;
 }
-- 
GitLab