From 35cd635ff50abcafe87c95040336157abea92daf Mon Sep 17 00:00:00 2001 From: "Mulas, Giacomo" Date: Sat, 7 Dec 2024 11:38:50 +0100 Subject: [PATCH] use less memory for identity subtraction in iterative refinement --- src/libnptm/cublas_calls.cpp | 12 ++++++------ src/libnptm/lapack_calls.cpp | 17 ++++++++--------- src/libnptm/magma_calls.cpp | 12 ++++++------ 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/src/libnptm/cublas_calls.cpp b/src/libnptm/cublas_calls.cpp index 754ee46b..caf483a6 100644 --- a/src/libnptm/cublas_calls.cpp +++ b/src/libnptm/cublas_calls.cpp @@ -143,12 +143,12 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double & cudacall(cudaMalloc(&d_a_residual, m*m*sizeof(cuDoubleComplex))); cudacall(cudaMalloc(&d_a_refine, m*m*sizeof(cuDoubleComplex))); // allocate memory for the temporary matrix products - dcomplex *native_id = new dcomplex[m]; - for (np_int i=0; i(&d_id, m*sizeof(cuDoubleComplex))); - cudacall(cudaMemcpy(d_id, m_id, m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy identity to device vector + cudacall(cudaMalloc(&d_id, sizeof(cuDoubleComplex))); + cudacall(cudaMemcpy(d_id, m_id, sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy identity to device vector delete[] native_id; // free identity vector on host } bool iteraterefine = true; @@ -167,7 +167,7 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double & // Fortran-style arrays, whereas our arrays are C-style row-major. cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m)); // add the identity to the product - cublascall(CUZAXPY(handle, m, &cu_one, d_id, 1, d_a_residual, m+1)); + cublascall(CUZAXPY(handle, m, &cu_one, d_id, 0, d_a_residual, m+1)); double oldmax=0; if (refinemode >0) { np_int maxindex; @@ -191,7 +191,7 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double & // multiply minus the original matrix times the new inverse matrix cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m)); // add the identity to the product - cublascall(CUZAXPY(handle, m, &cu_one, d_id, 1, d_a_residual, m+1)); + cublascall(CUZAXPY(handle, m, &cu_one, d_id, 0, d_a_residual, m+1)); if ((refinemode==2) || ((refinemode==1) && (iter == (maxiters-1)))) { // find the maximum absolute value of the residual np_int maxindex; diff --git a/src/libnptm/lapack_calls.cpp b/src/libnptm/lapack_calls.cpp index a23036ce..0b9af94e 100644 --- a/src/libnptm/lapack_calls.cpp +++ b/src/libnptm/lapack_calls.cpp @@ -92,6 +92,7 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl #endif np_int nn = n*n; np_int incx = 1; + np_int incx0 = 0; #ifdef USE_MKL MKL_Complex16 *arr_orig = NULL; MKL_Complex16 *arr_residual = NULL; @@ -108,17 +109,15 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl arr_orig = new MKL_Complex16[nn]; arr_residual = new MKL_Complex16[nn]; arr_refine = new MKL_Complex16[nn]; - id = new MKL_Complex16[n]; - for (np_int i=0; i0) { np_int maxindex = izamax_(&nn, arr_residual, &incx); @@ -171,7 +170,7 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl zaxpy_(&nn, &dcone, arr_refine, &incx, arr, &incx); // zcopy_(&nn, arr_refine, &incx, arr, &incx); zgemm_(&transa, &transa, &n, &n, &n, &dcmone, arr, &n, arr_orig, &n, &dczero, arr_residual, &n); - zaxpy_(&n, &dcone, id, &incx, arr_residual, &incy); + zaxpy_(&n, &dcone, id, &incx0, arr_residual, &incy); if ((refinemode==2) || ((refinemode==1) && (iter == (maxiters-1)))) { np_int maxindex = izamax_(&nn, arr_residual, &incx); #ifdef USE_MKL diff --git a/src/libnptm/magma_calls.cpp b/src/libnptm/magma_calls.cpp index 593374c1..514af4e4 100644 --- a/src/libnptm/magma_calls.cpp +++ b/src/libnptm/magma_calls.cpp @@ -132,16 +132,16 @@ void magma_zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, } // allocate memory for the identity vector on the host { - dcomplex *native_id = new dcomplex[m]; - for (magma_int_t i=0; i0) { // find the maximum absolute value of the residual @@ -184,7 +184,7 @@ void magma_zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, // multiply minus the original matrix times the new inverse matrix magma_zgemm(MagmaNoTrans, MagmaNoTrans, m, m, m, magma_mone, d_a, m, d_a_orig, m, magma_zero, d_a_residual, m, queue); // add the identity to the product - magma_zaxpy (m, magma_one, d_id, 1, d_a_residual, m+1, queue); + magma_zaxpy (m, magma_one, d_id, 0, d_a_residual, m+1, queue); if ((refinemode==2) || ((refinemode==1) && (iter == (maxiters-1)))) { // find the maximum absolute value of the residual magma_int_t maxindex = magma_izamax(mm, d_a_residual, 1, queue); -- GitLab