From 35cd635ff50abcafe87c95040336157abea92daf Mon Sep 17 00:00:00 2001
From: "Mulas, Giacomo" <gmulas@oa-cagliari.inaf.it>
Date: Sat, 7 Dec 2024 11:38:50 +0100
Subject: [PATCH] use less memory for identity subtraction in iterative
 refinement

---
 src/libnptm/cublas_calls.cpp | 12 ++++++------
 src/libnptm/lapack_calls.cpp | 17 ++++++++---------
 src/libnptm/magma_calls.cpp  | 12 ++++++------
 3 files changed, 20 insertions(+), 21 deletions(-)
diff --git a/src/libnptm/cublas_calls.cpp b/src/libnptm/cublas_calls.cpp
index 754ee46b..caf483a6 100644
--- a/src/libnptm/cublas_calls.cpp
+++ b/src/libnptm/cublas_calls.cpp
@@ -143,12 +143,12 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
     cudacall(cudaMalloc<cuDoubleComplex>(&d_a_residual, m*m*sizeof(cuDoubleComplex)));
     cudacall(cudaMalloc<cuDoubleComplex>(&d_a_refine, m*m*sizeof(cuDoubleComplex)));
     // allocate memory for the temporary matrix products
-    dcomplex *native_id = new dcomplex[m];
-    for (np_int i=0; i<m; i++) native_id[i] = 1;
+    dcomplex *native_id = new dcomplex[1];
+    native_id[0] = 1;
     cuDoubleComplex *m_id = (cuDoubleComplex *) &(native_id[0]);
     // fill it with 1
-    cudacall(cudaMalloc<cuDoubleComplex>(&d_id, m*sizeof(cuDoubleComplex)));
-    cudacall(cudaMemcpy(d_id, m_id, m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy identity to device vector
+    cudacall(cudaMalloc<cuDoubleComplex>(&d_id, sizeof(cuDoubleComplex)));
+    cudacall(cudaMemcpy(d_id, m_id, sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy identity to device vector
     delete[] native_id; // free identity vector on host
   }
   bool iteraterefine = true;
@@ -167,7 +167,7 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
     // Fortran-style arrays, whereas our arrays are C-style row-major.
     cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
     // add the identity to the product
-    cublascall(CUZAXPY(handle, m, &cu_one, d_id, 1, d_a_residual, m+1));
+    cublascall(CUZAXPY(handle, m, &cu_one, d_id, 0, d_a_residual, m+1));
     double oldmax=0;
     if (refinemode >0) {
       np_int maxindex;
@@ -191,7 +191,7 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
       // multiply minus the original matrix times the new inverse matrix
       cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
       // add the identity to the product
-      cublascall(CUZAXPY(handle, m, &cu_one, d_id, 1, d_a_residual, m+1));
+      cublascall(CUZAXPY(handle, m, &cu_one, d_id, 0, d_a_residual, m+1));
       if ((refinemode==2) || ((refinemode==1) && (iter == (maxiters-1)))) {
 	// find the maximum absolute value of the residual
 	np_int maxindex;
diff --git a/src/libnptm/lapack_calls.cpp b/src/libnptm/lapack_calls.cpp
index a23036ce..0b9af94e 100644
--- a/src/libnptm/lapack_calls.cpp
+++ b/src/libnptm/lapack_calls.cpp
@@ -92,6 +92,7 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl
 #endif
   np_int nn = n*n;
   np_int incx = 1;
+  np_int incx0 = 0;
 #ifdef USE_MKL
   MKL_Complex16 *arr_orig = NULL;
   MKL_Complex16 *arr_residual = NULL;
@@ -108,17 +109,15 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl
     arr_orig = new MKL_Complex16[nn];
     arr_residual = new MKL_Complex16[nn];
     arr_refine = new MKL_Complex16[nn];
-    id = new MKL_Complex16[n];
-    for (np_int i=0; i<n ; i++) {
-      id[i].real =  1;
-      id[i].imag =  0;
-    }
+    id = new MKL_Complex16[1];
+    id[0].real =  1;
+    id[0].imag =  0;
 #else
     arr_orig = new dcomplex[nn];
     arr_residual = new dcomplex[nn];
     arr_refine = new dcomplex[nn];
-    id = new dcomplex[n];
-    for (np_int i=0; i<n ; i++) id[i] = (dcomplex) 1;
+    id = new dcomplex[1];
+    id[0] = (dcomplex) 1;
 #endif
     zcopy_(&nn, arr, &incx, arr_orig, &incx);
   }
@@ -153,7 +152,7 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl
     // Fortran-style arrays, whereas our arrays are C-style row-major.
     zgemm_(&transa, &transa, &n, &n, &n, &dcmone, arr, &n, arr_orig, &n, &dczero, arr_residual, &n);
     np_int incy = n+1;
-    zaxpy_(&n, &dcone, id, &incx, arr_residual, &incy);
+    zaxpy_(&n, &dcone, id, &incx0, arr_residual, &incy);
     double oldmax = 0;
     if (refinemode >0) {
       np_int maxindex = izamax_(&nn, arr_residual, &incx);
@@ -171,7 +170,7 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl
       zaxpy_(&nn, &dcone, arr_refine, &incx, arr, &incx);
 	// zcopy_(&nn, arr_refine, &incx, arr, &incx);
       zgemm_(&transa, &transa, &n, &n, &n, &dcmone, arr, &n, arr_orig, &n, &dczero, arr_residual, &n);
-      zaxpy_(&n, &dcone, id, &incx, arr_residual, &incy);
+      zaxpy_(&n, &dcone, id, &incx0, arr_residual, &incy);
       if ((refinemode==2) || ((refinemode==1) && (iter == (maxiters-1)))) {
 	np_int maxindex = izamax_(&nn, arr_residual, &incx);
 #ifdef USE_MKL
diff --git a/src/libnptm/magma_calls.cpp b/src/libnptm/magma_calls.cpp
index 593374c1..514af4e4 100644
--- a/src/libnptm/magma_calls.cpp
+++ b/src/libnptm/magma_calls.cpp
@@ -132,16 +132,16 @@ void magma_zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters,
     }
     // allocate memory for the identity vector on the host
     {
-      dcomplex *native_id = new dcomplex[m];
-      for (magma_int_t i=0; i<m; i++) native_id[i] = 1;
+      dcomplex *native_id = new dcomplex[1];
+      native_id[0] = 1;
       magmaDoubleComplex *id = (magmaDoubleComplex *) &(native_id[0]);
       // fill it with 1
-      err = magma_zmalloc(&d_id, m);
+      err = magma_zmalloc(&d_id, 1);
       if (err != MAGMA_SUCCESS) {
 	printf("Error allocating d_id\n");
 	exit(1);
       }
-      magma_zsetvector(m, id, 1, d_id, 1, queue); // copy identity to device vector
+      magma_zsetvector(1, id, 1, d_id, 1, queue); // copy identity to device vector
       delete[] native_id; // free identity vector on host
     }
   }
@@ -161,7 +161,7 @@ void magma_zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters,
     // Fortran-style arrays, whereas our arrays are C-style row-major.
     magma_zgemm(MagmaNoTrans, MagmaNoTrans, m, m, m,  magma_mone, d_a, m, d_a_orig, m, magma_zero, d_a_residual, m, queue);
     // add the identity to the product
-    magma_zaxpy (m, magma_one, d_id, 1, d_a_residual, m+1, queue);
+    magma_zaxpy (m, magma_one, d_id, 0, d_a_residual, m+1, queue);
     double oldmax=0;
     if (refinemode >0) {
       // find the maximum absolute value of the residual
@@ -184,7 +184,7 @@ void magma_zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters,
       // multiply minus the original matrix times the new inverse matrix
       magma_zgemm(MagmaNoTrans, MagmaNoTrans, m, m, m, magma_mone, d_a, m, d_a_orig, m, magma_zero, d_a_residual, m, queue);
       // add the identity to the product
-      magma_zaxpy (m, magma_one, d_id, 1, d_a_residual, m+1, queue);
+      magma_zaxpy (m, magma_one, d_id, 0, d_a_residual, m+1, queue);
       if ((refinemode==2) || ((refinemode==1) && (iter == (maxiters-1)))) {
 	// find the maximum absolute value of the residual
 	magma_int_t maxindex = magma_izamax(mm, d_a_residual, 1, queue);
-- 
GitLab