From 50efe30f48899e819f34a7c66efd904c99a4154f Mon Sep 17 00:00:00 2001 From: "Mulas, Giacomo" Date: Thu, 5 Dec 2024 16:34:34 +0100 Subject: [PATCH] first implementation of CUBLAS-based matrix inversion --- build/configure.ac | 63 ++++++++++ src/cluster/cluster.cpp | 36 +++++- src/include/cublas_calls.h | 50 ++++++++ src/libnptm/algebraic.cpp | 18 ++- src/libnptm/cublas_calls.cpp | 232 +++++++++++++++++++++++++++++++++++ src/libnptm/lapack_calls.cpp | 16 +-- 6 files changed, 401 insertions(+), 14 deletions(-) create mode 100644 src/include/cublas_calls.h create mode 100644 src/libnptm/cublas_calls.cpp diff --git a/build/configure.ac b/build/configure.ac index fd029deb..e57581d2 100644 --- a/build/configure.ac +++ b/build/configure.ac @@ -163,6 +163,69 @@ m4_define( ] ) +m4_define( + [M4_DETECT_CUBLAS], + [ + pkg-config --version > /dev/null + use_pkg_config=$? + if test "x${CUDAFLAGS}${CUDALDFLAGS}" = "x"; then + if test "x$use_pkg_config" = "x0"; then + # pkg-config is available + declare -a pkg_array=$(pkg-config --list-all | grep cublas) + for i in "${pkg_array[[@]]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep cublas > /dev/null + result=$? + if test "x$result" = "x0"; then + # CUBLAS detected + cuda_pkg=$(for i in "${pkg_array[[@]]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep cublas) + CUDAFLAGS=$(pkg-config --cflags ${cuda_pkg}) + CUDALDFLAGS=$(pkg-config --libs ${cuda_pkg}) + fi # end of CUBLAS runtime decision tree + declare -a pkg_array=$(pkg-config --list-all | grep cudart) + for i in "${pkg_array[[@]]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep cudart > /dev/null + result=$? + if test "x$result" = "x0"; then + # CUDA runtime detected + cuda_pkg=$(for i in "${pkg_array[[@]]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep cudart) + CUDAFLAGS=$(pkg-config --cflags ${cuda_pkg}) + CUDALDFLAGS=$(pkg-config --libs ${cuda_pkg}) + fi # end of CUDA runtime decision tree + echo $CUDALDFLAGS | grep cublas > /dev/null + cudart_check=$? + if test "x${cudart_check}" != "x0"; then + CUDALDFLAGS="$CUDALDFLAGS -lcublas" + fi + echo $CUDALDFLAGS | grep cudart > /dev/null + cudart_check=$? 
+ if test "x${cudart_check}" != "x0"; then + CUDALDFLAGS="$CUDALDFLAGS -lcudart" + fi + else + # pkg-config is not available + if test -f /usr/local/cuda/include/cuda.h; then + CUDAFLAGS="-I/usr/local/cuda/include" + CUDALDFLAGS="-L/usr/local/cuda/lib64 -lcublas -lcudart" + elif test -f /usr/include/cuda.h; then + CUDAFLAGS="-I/usr/include" + CUDALDFLAGS="-lcublas -lcudart" + elif test "x$CUDA_HOME" != "x"; then + CUDAFLAGS="-I${CUDA_HOME}/include" + CUDALDFLAGS="-L${CUDA_HOME}/lib64 -lcublas -lcudart" + fi + fi # end of pkg-config decision tree + fi # end of CUDAFLAGS user override protection + if test "x$CUDAFLAGS" != "x" then + # somehow CUDAFLAGS was defined + export CUDAFLAGS + export CUBLASFLAGS="-DUSE_CUBLAS ${CUDAFLAGS}" + fi + if test "x$CUDALDFLAGS" != "x" then + # somehow CUDALDFLAGS was defined + export CUDALDFLAGS + export CUBLASLDFLAGS="${CUDALDFLAGS}" + fi + ] +) + m4_define( [M4_DETECT_MAGMA], [ diff --git a/src/cluster/cluster.cpp b/src/cluster/cluster.cpp index f78465a7..0fd0874f 100644 --- a/src/cluster/cluster.cpp +++ b/src/cluster/cluster.cpp @@ -24,17 +24,26 @@ #include #include #include + #ifdef _OPENMP #include #endif + #ifdef USE_MPI #ifndef MPI_VERSION #include #endif #endif + #ifdef USE_NVTX #include #endif + +#define USE_CUBLAS 1 +#ifdef USE_CUBLAS +#include +#endif + #ifdef USE_MAGMA #include #endif @@ -117,10 +126,13 @@ void cluster(const string& config_file, const string& data_file, const string& o Logger *logger = new Logger(LOG_DEBG); int device_count = 0; +#ifdef USE_CUBLAS + cudaGetDeviceCount(&device_count); + logger->log("DEBUG: Proc-" + to_string(mpidata->rank) + " found " + to_string(device_count) + " CUDA devices.\n", LOG_DEBG); +#elif defined USE_MAGMA //=========== // Initialise MAGMA //=========== -#ifdef USE_MAGMA // GMu note: MAGMA does not necessarily rely on CUDA, it may just as well run on openCL or HIP, we should consider alternative ways to detect the number of devices if MAGMA is not using CUDA but something else cudaGetDeviceCount(&device_count); logger->log("DEBUG: Proc-" + to_string(mpidata->rank) + " found " + to_string(device_count) + " CUDA devices.\n", LOG_DEBG); @@ -136,8 +148,8 @@ void cluster(const string& config_file, const string& data_file, const string& o delete logger; return; } -#endif // end MAGMA initialisation +#endif //=========================== // the following only happens on MPI process 0 @@ -279,7 +291,9 @@ void cluster(const string& config_file, const string& data_file, const string& o // Create empty virtual binary file VirtualBinaryFile *vtppoanp = new VirtualBinaryFile(); string tppoan_name = output_path + "/c_TPPOAN"; -#ifdef USE_MAGMA +#ifdef USE_CUBLAS + logger->log("INFO: using CUBLAS calls.\n", LOG_INFO); +#elif defined USE_MAGMA logger->log("INFO: using MAGMA calls.\n", LOG_INFO); #elif defined USE_LAPACK logger->log("INFO: using LAPACK calls.\n", LOG_INFO); @@ -549,7 +563,9 @@ void cluster(const string& config_file, const string& data_file, const string& o delete sconf; delete gconf; -#ifdef USE_MAGMA +#ifdef USE_CUBLAS + // just a placeholder to skip magma finalisation if we are using cublas +#elif defined USE_MAGMA logger->log("INFO: Process " + to_string(mpidata->rank) + " finalizes MAGMA.\n"); magma_finalize(); #endif @@ -672,7 +688,9 @@ void cluster(const string& config_file, const string& data_file, const string& o delete sconf; delete gconf; #endif -#ifdef USE_MAGMA +#ifdef USE_CUBLAS + // placeholder to avoid magma if using cublas +#elif defined USE_MAGMA logger->log("INFO: Process " + 
     magma_finalize();
 #endif
@@ -790,6 +808,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
   string outam0_name = output_path + "/c_AM0_JXI" + to_string(jxi488) + ".txt";
   sprintf(virtual_line, " AM matrix before CMS\n");
   outam0->append_line(virtual_line);
+  sprintf(virtual_line, " %d\n", ndit);
+  outam0->append_line(virtual_line);
   sprintf(virtual_line, " I1+1 I2+1 Real Imag\n");
   outam0->append_line(virtual_line);
   write_dcomplex_matrix(outam0, cid->am, ndit, ndit);
@@ -802,6 +822,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
   string outam1_name = output_path + "/c_AM1_JXI" + to_string(jxi488) + ".txt";
   sprintf(virtual_line, " AM matrix after CMS before LUCIN\n");
   outam1->append_line(virtual_line);
+  sprintf(virtual_line, " %d\n", ndit);
+  outam1->append_line(virtual_line);
   sprintf(virtual_line, " I1+1 I2+1 Real Imag\n");
   outam1->append_line(virtual_line);
   write_dcomplex_matrix(outam1, cid->am, ndit, ndit, " %5d %5d (%17.8lE,%17.8lE)\n", 1);
@@ -838,6 +860,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
   string outam2_name = output_path + "/c_AM2_JXI" + to_string(jxi488) + ".txt";
   sprintf(virtual_line, " AM matrix after LUCIN before ZTM\n");
   outam2->append_line(virtual_line);
+  sprintf(virtual_line, " %d\n", ndit);
+  outam2->append_line(virtual_line);
   sprintf(virtual_line, " I1+1 I2+1 Real Imag\n");
   outam2->append_line(virtual_line);
   write_dcomplex_matrix(outam2, cid->am, ndit, ndit);
@@ -867,6 +891,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
   string outam3_name = output_path + "/c_AM3_JXI" + to_string(jxi488) + ".txt";
   sprintf(virtual_line, " AM matrix after ZTM\n");
   outam3->append_line(virtual_line);
+  sprintf(virtual_line, " %d\n", ndit);
+  outam3->append_line(virtual_line);
   sprintf(virtual_line, " I1+1 I2+1 Real Imag\n");
   outam3->append_line(virtual_line);
   write_dcomplex_matrix(outam3, cid->am, ndit, ndit);
diff --git a/src/include/cublas_calls.h b/src/include/cublas_calls.h
new file mode 100644
index 00000000..e71b7558
--- /dev/null
+++ b/src/include/cublas_calls.h
@@ -0,0 +1,50 @@
+/* Copyright (C) 2024   INAF - Osservatorio Astronomico di Cagliari
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   A copy of the GNU General Public License is distributed along with
+   this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
+ */
+
+/*! \file cublas_calls.h
+ *
+ * \brief C++ interface to CUBLAS calls.
+ *
+ */
+
+#ifndef INCLUDE_CUBLAS_CALLS_H_
+#define INCLUDE_CUBLAS_CALLS_H_
+
+/*! \brief Invert a complex matrix with double precision elements.
+ *
+ * Use CUBLAS to perform an in-place matrix inversion for a complex
+ * matrix with double precision elements.
+ *
+ * \param mat: Matrix of complex. The matrix to be inverted.
+ * \param n: `np_int` The number of rows and columns of the [n x n] matrix.
+ * \param device_id: `int` ID of the device for matrix inversion offloading.
+ */
+void cublas_zinvert(dcomplex **mat, np_int n, int device_id);
+
+/*!
+ * \brief Invert a complex matrix with double precision elements, applying iterative refinement of the solution.
+ *
+ * Use CUBLAS to perform matrix inversion for a complex
+ * matrix with double precision elements.
+ *
+ * \param mat: Matrix of complex. The matrix to be inverted.
+ * \param n: `np_int` The number of rows and columns of the [n x n] matrix.
+ * \param maxrefiters: `int` Maximum number of refinement iterations to apply.
+ * \param accuracygoal: `double` Accuracy to achieve in iterative refinement, defined as the module of the maximum difference between the identity matrix and the matrix product of the (approximate) inverse times the original matrix. On return, it contains the actually achieved accuracy.
+ * \param refinemode: `int` Refinement mode: if greater than 0, the residual is checked after each iteration and refinement stops as soon as the accuracy goal is met or the residual grows; if equal to 2, maxrefiters is additionally updated on return with the number of iterations actually performed.
+ * \param device_id: `int` ID of the device for matrix inversion offloading.
+ */
+void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxrefiters, double &accuracygoal, int refinemode, int device_id);
+
+#endif
diff --git a/src/libnptm/algebraic.cpp b/src/libnptm/algebraic.cpp
index 4fb5f117..4e47c7dc 100644
--- a/src/libnptm/algebraic.cpp
+++ b/src/libnptm/algebraic.cpp
@@ -38,6 +38,16 @@
 #endif
 #endif
 
+// define by hand for a first test
+#define USE_CUBLAS 1
+#ifdef USE_CUBLAS
+// define by hand for a first test
+//#define USE_REFINEMENT 1
+#ifndef INCLUDE_CUBLAS_CALLS_H_
+#include "../include/cublas_calls.h"
+#endif
+#endif
+
 #ifndef INCLUDE_ALGEBRAIC_H_
 #include "../include/algebraic.h"
 #endif
@@ -50,7 +60,13 @@ using namespace std;
 void invert_matrix(dcomplex **mat, np_int size, int &ier, int &maxrefiters, double &accuracygoal, int refinemode, np_int max_size, int target_device) {
   ier = 0;
-#ifdef USE_MAGMA
+#ifdef USE_CUBLAS
+#ifdef USE_REFINEMENT
+  cublas_zinvert_and_refine(mat, size, maxrefiters, accuracygoal, refinemode, target_device);
+#else
+  cublas_zinvert(mat, size, target_device);
+#endif
+#elif defined USE_MAGMA
 #ifdef USE_REFINEMENT
   // try using the iterative refinement to obtain a more accurate solution
   // we pass to magma_zinvert_and_refine() the accuracygoal in, get the actual
diff --git a/src/libnptm/cublas_calls.cpp b/src/libnptm/cublas_calls.cpp
new file mode 100644
index 00000000..ad191961
--- /dev/null
+++ b/src/libnptm/cublas_calls.cpp
@@ -0,0 +1,232 @@
+/* Copyright (C) 2024   INAF - Osservatorio Astronomico di Cagliari
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   A copy of the GNU General Public License is distributed along with
+   this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
+ */
+
+/*! \file cublas_calls.cpp
+ *
+ * \brief Implementation of the interface with CUBLAS libraries.
+ */
+#ifndef INCLUDE_TYPES_H_
+#include "../include/types.h"
+#endif
+
+#define USE_CUBLAS 1
+#ifdef USE_CUBLAS
+
+#ifndef INCLUDE_CUBLAS_CALLS_H_
+#include "../include/cublas_calls.h"
+#endif
+
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cstdio>
+
+#ifdef USE_ILP64
+#define CUZGEMM cublasZgemm_64
+#define CUZAXPY cublasZaxpy_64
+#define CUIZAMAX cublasIzamax_64
+#else
+#define CUZGEMM cublasZgemm
+#define CUZAXPY cublasZaxpy
+#define CUIZAMAX cublasIzamax
+#endif
+
+#define cudacall(call) \
+  do \
+  { \
+    cudaError_t err = (call); \
+    if(cudaSuccess != err) \
+    { \
+      fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
+      cudaDeviceReset(); \
+      exit(EXIT_FAILURE); \
+    } \
+  } \
+  while (0)
+
+#define cublascall(call) \
+  do \
+  { \
+    cublasStatus_t status = (call); \
+    if(CUBLAS_STATUS_SUCCESS != status) \
+    { \
+      fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
+      cudaDeviceReset(); \
+      exit(EXIT_FAILURE); \
+    } \
+  } \
+  while(0)
+
+void cublas_zinvert(dcomplex **mat, np_int n, int device_id) {
+  cudacall(cudaSetDevice(device_id));
+  cublasHandle_t handle;
+  cublascall(cublasCreate_v2(&handle));
+  int batchsize = 1;
+  int *piv, *info; // array of pivot indices and factorisation status
+  np_int m = (np_int) n; // number of rows; a is an m x m matrix
+  np_int mm = m * m; // size of a
+  cudacall(cudaMalloc(&piv, m*batchsize*sizeof(int)));
+  cudacall(cudaMalloc(&info, batchsize*sizeof(int)));
+  cuDoubleComplex *a = (cuDoubleComplex *)&(mat[0][0]); // pointer to first element on host
+  cuDoubleComplex *d_a; // pointer to first element on device
+  cudacall(cudaMalloc(&d_a,m*m*sizeof(cuDoubleComplex)));
+  cudacall(cudaMemcpy(d_a, a, m*m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
+  cuDoubleComplex **batch_d_a;
+  cudacall(cudaMalloc(&batch_d_a,batchsize*sizeof(cuDoubleComplex*)));
+  cudacall(cudaMemcpy(batch_d_a, &d_a, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice));
+  cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize));
+  cuDoubleComplex *d_c; // this will contain the inverted matrix on the device
+  cudacall(cudaMalloc(&d_c,m*m*sizeof(cuDoubleComplex)));
+  cuDoubleComplex **batch_d_c;
+  cudacall(cudaMalloc(&batch_d_c,batchsize*sizeof(cuDoubleComplex*)));
+  cudacall(cudaMemcpy(batch_d_c, &d_c, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice));
+  cublascall(cublasZgetriBatched(handle,n,batch_d_a,m,piv,batch_d_c,m,info,batchsize));
+  cudacall(cudaMemcpy(a,d_c,m*m*sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
+  cudaFree(batch_d_a);
+  cudaFree(batch_d_c);
+  cudaFree(piv);
+  cudaFree(info);
+  cudaFree(d_a);
+  cudaFree(d_c);
+  cublasDestroy_v2(handle);
+}
+
+void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &accuracygoal, int refinemode, int device_id) {
+  cudacall(cudaSetDevice(device_id));
+  cublasHandle_t handle;
+  cublascall(cublasCreate_v2(&handle));
+  int batchsize = 1;
+  int *piv, *info; // array of pivot indices and factorisation status
+  np_int m = (np_int) n; // number of rows; a is an m x m matrix
+  np_int mm = m * m; // size of a
+  cudacall(cudaMalloc(&piv, m*batchsize*sizeof(int)));
+  cudacall(cudaMalloc(&info, batchsize*sizeof(int)));
+  cuDoubleComplex *a = (cuDoubleComplex *)&(mat[0][0]); // pointer to first element on host
+  cuDoubleComplex *d_a; // pointer to first element on device
+  cudacall(cudaMalloc(&d_a,m*m*sizeof(cuDoubleComplex)));
+  cudacall(cudaMemcpy(d_a, a, m*m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice));
+  cuDoubleComplex **batch_d_a;
+  cudacall(cudaMalloc(&batch_d_a,batchsize*sizeof(cuDoubleComplex*)));
+  cudacall(cudaMemcpy(batch_d_a, &d_a, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice));
+  cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize));
+  cuDoubleComplex *d_c; // this will contain the inverted matrix on the device
+  cudacall(cudaMalloc(&d_c,m*m*sizeof(cuDoubleComplex)));
+  cuDoubleComplex **batch_d_c;
+  cudacall(cudaMalloc(&batch_d_c,batchsize*sizeof(cuDoubleComplex*)));
+  cudacall(cudaMemcpy(batch_d_c, &d_c, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice));
+  cublascall(cublasZgetriBatched(handle,m,batch_d_a,m,piv,batch_d_c,m,info,batchsize));
+  //cudacall(cudaMemcpy(a,d_c,m*m*sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
+  cudaFree(batch_d_a);
+  cudaFree(batch_d_c);
+  cudaFree(piv);
+  cudaFree(info);
+  cuDoubleComplex *d_a_residual;
+  cuDoubleComplex *d_a_refine;
+  cuDoubleComplex *d_id;
+  if (maxiters>0) {
+    // copy the original matrix again to d_a, so I do not need to destroy the old d_a and recreate a new one
+    cudacall(cudaMemcpy(d_a, a, m*m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // from here on, d_a contains the original matrix, for refinement use
+    // allocate memory for the temporary matrix products
+    cudacall(cudaMalloc(&d_a_residual, m*m*sizeof(cuDoubleComplex)));
+    cudacall(cudaMalloc(&d_a_refine, m*m*sizeof(cuDoubleComplex)));
+    // build a vector of ones on the host, to become the diagonal of the identity matrix
+    dcomplex *native_id = new dcomplex[m];
+    for (np_int i=0; i<m; i++) native_id[i] = 1.0;
+    cuDoubleComplex *m_id = (cuDoubleComplex *) native_id; // host identity diagonal viewed as the CUDA complex type (reconstructed line)
+    cudacall(cudaMalloc(&d_id, m*sizeof(cuDoubleComplex)));
+    cudacall(cudaMemcpy(d_id, m_id, m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy identity to device vector
+    delete[] native_id; // free identity vector on host
+  }
+  bool iteraterefine = true;
+  if (maxiters>0) {
+    cuDoubleComplex cu_mone;
+    (((double *) &(cu_mone))[0]) = -1;
+    (((double *) &(cu_mone))[1]) = 0;
+    cuDoubleComplex cu_one;
+    (((double *) &(cu_one))[0]) = 1;
+    (((double *) &(cu_one))[1]) = 0;
+    cuDoubleComplex cu_zero;
+    (((double *) &(cu_zero))[0]) = 0;
+    (((double *) &(cu_zero))[1]) = 0;
+    // multiply minus the original matrix times the inverse matrix
+    // NOTE: factors in zgemm are swapped because zgemm is designed for column-major
+    // Fortran-style arrays, whereas our arrays are C-style row-major.
+    cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
+    // add the identity to the product, i.e. add 1 to the diagonal elements (stride m+1)
+    cublascall(CUZAXPY(handle, m, &cu_one, d_id, 1, d_a_residual, m+1));
+    double oldmax=0;
+    if (refinemode >0) {
+      np_int maxindex;
+      // find the maximum absolute value of the residual
+      cublascall(CUIZAMAX(handle, mm, d_a_residual, 1, &maxindex));
+      cuDoubleComplex cublasmax;
+      // transfer the maximum value to the host (cublasIzamax returns a 1-based index)
+      cudacall(cudaMemcpy(&cublasmax, d_a_residual+maxindex-1, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost));
+      // take the module
+      oldmax = cabs( (((double *) &(cublasmax))[0]) + I*(((double *) &(cublasmax))[1]));
+      printf("Initial max residue = %g\n", oldmax);
+      if (oldmax < accuracygoal) iteraterefine = false;
+    }
+    // begin correction loop (should iterate at most maxiters times)
+    int iter;
+    for (iter=0; (iter<maxiters) && iteraterefine; iter++) {
+      // NOTE: the body of this loop was reconstructed from context; it applies the same
+      // refinement scheme as zinvert_and_refine() in lapack_calls.cpp: X <- X + X*(I - A*X)
+      // correction term: inverse times residual (factors swapped for row-major storage)
+      cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_one, d_a_residual, m, d_c, m, &cu_zero, d_a_refine, m));
+      // apply the correction to the current estimate of the inverse
+      cublascall(CUZAXPY(handle, mm, &cu_one, d_a_refine, 1, d_c, 1));
+      // recompute the residual I - A*X for the updated inverse
+      cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
+      cublascall(CUZAXPY(handle, m, &cu_one, d_id, 1, d_a_residual, m+1));
+      double newmax = 0;
+      if (refinemode > 0) {
+	np_int maxindex;
+	cublascall(CUIZAMAX(handle, mm, d_a_residual, 1, &maxindex));
+	cuDoubleComplex cublasmax;
+	cudacall(cudaMemcpy(&cublasmax, d_a_residual+maxindex-1, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost));
+	newmax = cabs( (((double *) &(cublasmax))[0]) + I*(((double *) &(cublasmax))[1]));
+	printf("Max residue after %d refinement iterations = %g\n", iter+1, newmax);
+	if ((newmax > oldmax)||(newmax < accuracygoal)) iteraterefine = false;
+	oldmax = newmax;
+      }
+    }
+    // if we are being called with refinemode=2, then on exit we set maxiters to the actual number of iters we performed to achieve the required accuracy
+    if (refinemode==2) maxiters = iter;
+    accuracygoal = oldmax;
+    // end correction loop
+  }
+  // copy the final refined inverted matrix back from device to host
+  cudacall(cudaMemcpy(a,d_c,m*m*sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost));
+  // free temporary device arrays
+  cudaFree(d_a);
+  cudaFree(d_c);
+  if (maxiters>0) {
+    cudaFree(d_id);
+    cudaFree(d_a_refine);
+    cudaFree(d_a_residual);
+  }
+  cublasDestroy_v2(handle);
+}
+
+#endif
diff --git a/src/libnptm/lapack_calls.cpp b/src/libnptm/lapack_calls.cpp
index 9948e816..ece8df60 100644
--- a/src/libnptm/lapack_calls.cpp
+++ b/src/libnptm/lapack_calls.cpp
@@ -94,14 +94,14 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl
   np_int incx = 1;
 #ifdef USE_MKL
   MKL_Complex16 *arr_orig = NULL;
-  MKL_Complex16 *arr_residual = NULL; 
-  MKL_Complex16 *arr_refine = NULL; 
-  MKL_Complex16 *id = NULL; 
+  MKL_Complex16 *arr_residual = NULL;
+  MKL_Complex16 *arr_refine = NULL;
+  MKL_Complex16 *id = NULL;
 #else
-  dcomplex *arr_orig = NULL; 
-  dcomplex *arr_residual = NULL; 
-  dcomplex *arr_refine = NULL; 
-  dcomplex *id = NULL; 
+  dcomplex *arr_orig = NULL;
+  dcomplex *arr_residual = NULL;
+  dcomplex *arr_refine = NULL;
+  dcomplex *id = NULL;
 #endif
   if (maxiters>0) {
 #ifdef USE_MKL
@@ -122,7 +122,7 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl
 #endif
     zcopy_(&nn, arr, &incx, arr_orig, &incx);
   }
-  const dcomplex uim = 0.0 + 1.0 * I;
+  // const dcomplex uim = 0.0 + 1.0 * I;
   np_int* IPIV = new np_int[n]();
-- 
GitLab
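
Usage note (not part of the patch): a minimal host-side sketch of how the new cublas_zinvert entry point can be exercised in isolation. The test values, the choice of device 0 and the include paths are illustrative assumptions; the contiguous [n x n] allocation is required because cublas_zinvert copies n*n elements starting at &mat[0][0], and the two-level printf uses the same double-pointer view of dcomplex already used inside cublas_calls.cpp.

// illustrative test driver, assuming it is built next to the libnptm sources
// with the CUBLASFLAGS / CUBLASLDFLAGS exported by M4_DETECT_CUBLAS
#include <cstdio>
#include "../include/types.h"
#include "../include/cublas_calls.h"

int main() {
  const np_int n = 2;
  dcomplex *block = new dcomplex[n * n];  // one contiguous block, as cublas_zinvert expects
  dcomplex **mat = new dcomplex*[n];
  for (np_int i = 0; i < n; i++) mat[i] = block + i * n;
  // a simple invertible 2 x 2 complex matrix (values are arbitrary)
  mat[0][0] = 1.0; mat[0][1] = 2.0 + 1.0 * I;
  mat[1][0] = 0.0; mat[1][1] = 3.0;
  cublas_zinvert(mat, n, 0);  // in-place inversion offloaded to CUDA device 0
  for (np_int i = 0; i < n; i++) {
    for (np_int j = 0; j < n; j++) {
      // print real and imaginary parts through the same raw double view used in cublas_calls.cpp
      printf("inv[%ld][%ld] = (%g, %g)\n", (long)i, (long)j,
             (((double *) &(mat[i][j]))[0]), (((double *) &(mat[i][j]))[1]));
    }
  }
  delete[] mat;
  delete[] block;
  return 0;
}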