diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 0000000000000000000000000000000000000000..25a912b4ace5a9d2a90c69d4accbb3f469a82231
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,28 @@
+authors:
+  - family-names: La Mura, Mulas, Saija
+    given-names: Giovanni, Giacomo, Rosalba
+cff-version: 1.2.0
+license: GNU GPLv3
+message: "If you use this software, please cite the references below."
+repository-code: "https://www.ict.inaf.it/gitlab/giacomo.mulas/np_tmcode"
+title: "Nano-particle Transition Matrix code"
+version: 8.3
+references:
+  - authors:
+    - family-names: Saija, Iatì, Borghese
+      given-names: Rosalba, Maria Antonia, Ferdinando
+    doi: 10.1086/322350
+    issue: 1
+    journal: "The Astrophysical Journal"
+    title: "Beyond Mie Theory: The Transition Matrix Approach in Interstellar Dust Modeling"
+    type: article
+    volume: 599
+    page: 993
+    year: 2001
+  - authors:
+    - family-names: Borghese, Denti, Saija
+      given-names: Ferdinando, Paolo, Rosalba
+    doi: 10.1007/978-3-540-37414-5
+    title: "Scattering from Model Nonspherical Particles"
+    type: book
+    year: 2007
diff --git a/README.md b/README.md
index 472443f23ae2e1a84bd35446739bfb446f401255..88a2a929a0d03e0c122d92e7f7b8dbdcc4ce8f06 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,10 @@ Distributing the code and its sources is possible under the terms of the GNU GPL
 
 *NOTE:* The building process requires a working installation of a C++ and a FORTRAN compiler. Many solutions are available, but the recommended option is the *GNU Compiler Collection* `gcc` with the addition of `g++` and `gfortran`. The parallel code implementation further requires the use of parallel compilers complying with the MPI standard (*OpenMPI*, *MPICH*).
 
+# Acknowledgments
+
+Supported by Italian Research Center on High Performance Computing Big Data and Quantum Computing (ICSC), project funded by European Union - NextGenerationEU - and National Recovery and Resilience Plan (NRRP) - Mission 4 Component 2 within the activities of Spoke 3 (Astrophysics and Cosmos Observations). This work was completed in part at the CINECA GPU HACKATHON 2024, part of the Open Hackathons program. The authors would like to acknowledge OpenACC-Standard.org for their support.
+
 # License
 
    Copyright (C) 2024   INAF - Osservatorio Astronomico di Cagliari
diff --git a/build/Makefile.in b/build/Makefile.in
index 21198afa6b69ced519eecafa68e1fc1dd6beaf08..a82f103bc3a50f7d90d196ef11114d485dd3982b 100644
--- a/build/Makefile.in
+++ b/build/Makefile.in
@@ -470,9 +470,7 @@ FFLAGS = @FFLAGS@
 FGREP = @FGREP@
 FILECMD = @FILECMD@
 GREP = @GREP@
-HDF5_INCLUDE = @HDF5_INCLUDE@
 HDF5_LDFLAGS = @HDF5_LDFLAGS@
-HDF5_LIB = @HDF5_LIB@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
diff --git a/build/configure b/build/configure
index 44ef1e6879e8804001c91c1fc199c2ab88307817..67e4ee42698a4e46ecd84deb4a13e4c3c14b60c9 100755
--- a/build/configure
+++ b/build/configure
@@ -669,8 +669,6 @@ OFFLOADFLAGS
 BUILDFORTRAN_FALSE
 BUILDFORTRAN_TRUE
 HDF5_LDFLAGS
-HDF5_LIB
-HDF5_INCLUDE
 CXXCPP
 LT_SYS_LIBRARY_PATH
 OTOOL64
@@ -9460,8 +9458,6 @@ EOF
       if test "x$CXX_IS_MPI" = "x0"; then
         MPIFLAGS=-DUSE_MPI
 
-        MPIFLAGS=""
-
       fi
     fi
 
@@ -24832,39 +24828,52 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 
 # Environment setup
-if test "x$HDF5_INCLUDE" = "x"
-then :
-  HDF5_INCLUDE="/usr/include/hdf5/serial"
 
-else case e in #(
-  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: HDF5_INCLUDE=$(HDF5_INCLUDE)" >&5
-printf "%s\n" "$as_me: HDF5_INCLUDE=$(HDF5_INCLUDE)" >&6;}
- ;;
-esac
-fi
-if test "x$HDF5_LIB" = "x"
-then :
-  HDF5_LIB="/usr/lib/x86_64-linux-gnu/hdf5/serial"
+    export -p | grep HDF5_ROOT > /dev/null
+    result=$?
+    if test "x$result" = "x0"; then
+      if test "x$HDF5_INCLUDE" = "x"; then
+        export HDF5_INCLUDE=${HDF5_ROOT}/include
+      fi
+      if test "x$HDF5_LIB" = "x"; then
+        export HDF5_LIB=${HDF5_ROOT}/lib
+      fi
+    fi
+    export -p | grep HDF5_DIR > /dev/null
+    result=$?
+    if test "x$result" = "x0"; then
+      if test "x$HDF5_INCLUDE" = "x"; then
+        export HDF5_INCLUDE=${HDF5_DIR}/include
+      fi
+      if test "x$HDF5_LIB" = "x"; then
+        export HDF5_LIB=${HDF5_DIR}/lib
+      fi
+    fi
+    if test "x$HDF5_INCLUDE" = "x"; then
+      export HDF5_INCLUDE="/usr/include/hdf5/serial"
+    fi
+    if test "x$HDF5_LIB" = "x"; then
+      export HDF5_LIB="/usr/lib/x86_64-linux-gnu/hdf5/serial"
+    fi
+
 
-else case e in #(
-  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: HDF5_LIB=$(HDF5_LIB)" >&5
-printf "%s\n" "$as_me: HDF5_LIB=$(HDF5_LIB)" >&6;}
- ;;
-esac
-fi
 
 # Check for required headers
-as_ac_Header=`printf "%s\n" "ac_cv_header_$HDF5_INCLUDE/hdf5.h" | sed "$as_sed_sh"`
-ac_fn_c_check_header_compile "$LINENO" "$HDF5_INCLUDE/hdf5.h" "$as_ac_Header" "$ac_includes_default"
-if eval test \"x\$"$as_ac_Header"\" = x"yes"
+# AC_CHECK_HEADER(
+#  [${HDF5_INCLUDE}/hdf5.h],
+#  ,
+#  AC_MSG_ERROR([Could not find HDF5 headers!]),
+# )
+if test -f ${HDF5_INCLUDE}/hdf5.h
 then :
-
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: HDF5 headers found in ${HDF5_INCLUDE}" >&5
+printf "%s\n" "$as_me: HDF5 headers found in ${HDF5_INCLUDE}" >&6;}
 else case e in #(
-  e) as_fn_error $? "\"Could not find HDF5 headers!" "$LINENO" 5 ;;
+  e) as_fn_error $? "HDF5 headers not found!" "$LINENO" 5
+ ;;
 esac
 fi
 
-
 # Check for required libraries
 
     cat > nptm_test_hdf5.cpp <<EOF
@@ -25105,7 +25114,7 @@ then :
 
     else
 
-    export -p | grep MKL
+    export -p | grep MKL > /dev/null
     MKL_DEF=$?
     if test "x$MKL_DEF" = "x0"; then
       export LAPACKFLAGS="-DUSE_LAPACK -DUSE_MKL -DLAPACK_ILP64 -DUSE_ILP64 -I{MKLROOT}/include"
@@ -25137,7 +25146,7 @@ fi
 else case e in #(
   e)
 
-    export -p | grep MKL
+    export -p | grep MKL > /dev/null
     MKL_DEF=$?
     if test "x$MKL_DEF" = "x0"; then
       export LAPACKFLAGS="-DUSE_LAPACK -DUSE_MKL -DLAPACK_ILP64 -DUSE_ILP64 -I{MKLROOT}/include"
@@ -25191,7 +25200,7 @@ then :
     elif test -f /usr/include/cuda.h; then
       CUDAFLAGS="-I/usr/include"
       CUDALDFLAGS="-lcudart"
-    elif text "x$CUDA_HOME" != "x"; then
+    elif test "x$CUDA_HOME" != "x"; then
       CUDAFLAGS="-I${CUDA_HOME}/include"
       CUDALDFLAGS="-L${CUDA_HOME}/lib64 -lcudart"
     fi
@@ -25205,6 +25214,9 @@ then :
       elif test "x$MAGMA_HOME" != "x"; then
         export MAGMAFLAGS="-DUSE_MAGMA -DMAGMA_ILP64 $CUDAFLAGS -I${MAGMA_HOME}/include"
 	export MAGMALDFLAGS="$CUDALDFLAGS -L${MAGMA_HOME}/lib -lmagma"
+      elif test "x$MAGMA_ROOT" != "x"; then
+        export MAGMAFLAGS="-DUSE_MAGMA -DMAGMA_ILP64 $CUDAFLAGS -I${MAGMA_ROOT}/include"
+	export MAGMALDFLAGS="$CUDALDFLAGS -L${MAGMA_ROOT}/lib -lmagma"
       fi
     fi
 
@@ -25234,7 +25246,7 @@ else case e in #(
     elif test -f /usr/include/cuda.h; then
       CUDAFLAGS="-I/usr/include"
       CUDALDFLAGS="-lcudart"
-    elif text "x$CUDA_HOME" != "x"; then
+    elif test "x$CUDA_HOME" != "x"; then
       CUDAFLAGS="-I${CUDA_HOME}/include"
       CUDALDFLAGS="-L${CUDA_HOME}/lib64 -lcudart"
     fi
@@ -25248,6 +25260,9 @@ else case e in #(
       elif test "x$MAGMA_HOME" != "x"; then
         export MAGMAFLAGS="-DUSE_MAGMA -DMAGMA_ILP64 $CUDAFLAGS -I${MAGMA_HOME}/include"
 	export MAGMALDFLAGS="$CUDALDFLAGS -L${MAGMA_HOME}/lib -lmagma"
+      elif test "x$MAGMA_ROOT" != "x"; then
+        export MAGMAFLAGS="-DUSE_MAGMA -DMAGMA_ILP64 $CUDAFLAGS -I${MAGMA_ROOT}/include"
+	export MAGMALDFLAGS="$CUDALDFLAGS -L${MAGMA_ROOT}/lib -lmagma"
       fi
     fi
 
diff --git a/build/configure.ac b/build/configure.ac
index 0238f91edac77c5f9a6a61c1f3db6b3a86ec5fac..7ea86e2fd8c6e01e2ddea97608620aafbc0cac7b 100644
--- a/build/configure.ac
+++ b/build/configure.ac
@@ -19,10 +19,42 @@ EOF
   ]
 )
 
+m4_define(
+  [M4_DETECT_HDF5],
+  [
+    export -p | grep HDF5_ROOT > /dev/null
+    result=$?
+    if test "x$result" = "x0"; then
+      if test "x$HDF5_INCLUDE" = "x"; then
+        export HDF5_INCLUDE=${HDF5_ROOT}/include
+      fi
+      if test "x$HDF5_LIB" = "x"; then
+        export HDF5_LIB=${HDF5_ROOT}/lib
+      fi
+    fi
+    export -p | grep HDF5_DIR > /dev/null
+    result=$?
+    if test "x$result" = "x0"; then
+      if test "x$HDF5_INCLUDE" = "x"; then
+        export HDF5_INCLUDE=${HDF5_DIR}/include
+      fi
+      if test "x$HDF5_LIB" = "x"; then
+        export HDF5_LIB=${HDF5_DIR}/lib
+      fi
+    fi
+    if test "x$HDF5_INCLUDE" = "x"; then
+      export HDF5_INCLUDE="/usr/include/hdf5/serial"
+    fi
+    if test "x$HDF5_LIB" = "x"; then
+      export HDF5_LIB="/usr/lib/x86_64-linux-gnu/hdf5/serial"
+    fi
+  ]
+)
+
 m4_define(
   [M4_DETECT_LAPACK],
   [
-    export -p | grep MKL
+    export -p | grep MKL > /dev/null
     MKL_DEF=$?
     if test "x$MKL_DEF" = "x0"; then
       export LAPACKFLAGS="-DUSE_LAPACK -DUSE_MKL -DLAPACK_ILP64 -DUSE_ILP64 -I{MKLROOT}/include"
@@ -45,7 +77,7 @@ m4_define(
     elif test -f /usr/include/cuda.h; then
       CUDAFLAGS="-I/usr/include"
       CUDALDFLAGS="-lcudart"
-    elif text "x$CUDA_HOME" != "x"; then
+    elif test "x$CUDA_HOME" != "x"; then
       CUDAFLAGS="-I${CUDA_HOME}/include"
       CUDALDFLAGS="-L${CUDA_HOME}/lib64 -lcudart"
     fi
@@ -59,6 +91,9 @@ m4_define(
       elif test "x$MAGMA_HOME" != "x"; then
         export MAGMAFLAGS="-DUSE_MAGMA -DMAGMA_ILP64 $CUDAFLAGS -I${MAGMA_HOME}/include"
 	export MAGMALDFLAGS="$CUDALDFLAGS -L${MAGMA_HOME}/lib -lmagma"
+      elif test "x$MAGMA_ROOT" != "x"; then
+        export MAGMAFLAGS="-DUSE_MAGMA -DMAGMA_ILP64 $CUDAFLAGS -I${MAGMA_ROOT}/include"
+	export MAGMALDFLAGS="$CUDALDFLAGS -L${MAGMA_ROOT}/lib -lmagma"
       fi
     fi
   ]
@@ -198,7 +233,6 @@ AC_ARG_ENABLE(
       M4_TEST_MPI
       if test "x$CXX_IS_MPI" = "x0"; then
         AC_SUBST([MPIFLAGS], [-DUSE_MPI])
-        AC_SUBST([MPIFLAGS], [""])
       fi
     fi
   ]
@@ -217,22 +251,18 @@ AS_IF(
 LT_INIT
 
 # Environment setup
-AS_IF(
-  [test "x$HDF5_INCLUDE" = "x"],
-  [AC_SUBST([HDF5_INCLUDE], ["/usr/include/hdf5/serial"])],
-  [AC_MSG_NOTICE([HDF5_INCLUDE=$(HDF5_INCLUDE)])]
-)
-AS_IF(
-  [test "x$HDF5_LIB" = "x"],
-  [AC_SUBST([HDF5_LIB], ["/usr/lib/x86_64-linux-gnu/hdf5/serial"])],
-  [AC_MSG_NOTICE([HDF5_LIB=$(HDF5_LIB)])]
-)
+M4_DETECT_HDF5
 
 # Check for required headers
-AC_CHECK_HEADER(
-  [$HDF5_INCLUDE/hdf5.h],
-  ,
-  AC_MSG_ERROR(["Could not find HDF5 headers!]),
+# AC_CHECK_HEADER(
+#  [${HDF5_INCLUDE}/hdf5.h],
+#  ,
+#  AC_MSG_ERROR([Could not find HDF5 headers!]),
+# )
+AS_IF(
+  [test -f ${HDF5_INCLUDE}/hdf5.h],
+  [AC_MSG_NOTICE([HDF5 headers found in ${HDF5_INCLUDE}])],
+  [AC_MSG_ERROR([HDF5 headers not found!])]
 )
 
 # Check for required libraries
diff --git a/src/cluster/cluster.cpp b/src/cluster/cluster.cpp
index 73c24667a7ce36a478f20795aab0a93849d514b9..6a2f841fba6b496db255caa32bbc7f47a3a7457b 100644
--- a/src/cluster/cluster.cpp
+++ b/src/cluster/cluster.cpp
@@ -100,16 +100,21 @@ void cluster(const string& config_file, const string& data_file, const string& o
   chrono::time_point<chrono::high_resolution_clock> t_start = chrono::high_resolution_clock::now();
   chrono::duration<double> elapsed;
   string message;
-  string timing_name = output_path + "/c_timing_mpi"+ to_string(mpidata->rank) +".log";
-  FILE *timing_file = fopen(timing_name.c_str(), "w");
-  Logger *time_logger = new Logger(LOG_DEBG, timing_file);
+  string timing_name;
+  FILE *timing_file;
+  Logger *time_logger;
+  if (mpidata->rank == 0) {
+    timing_name = output_path + "/c_timing_mpi"+ to_string(mpidata->rank) +".log";
+    timing_file = fopen(timing_name.c_str(), "w");
+    time_logger = new Logger(LOG_DEBG, timing_file);
+  }
   Logger *logger = new Logger(LOG_DEBG);
+  int device_count = 0;
 
   //===========
   // Initialise MAGMA
   //===========
 #ifdef USE_MAGMA
-  int device_count;
   cudaGetDeviceCount(&device_count);
   logger->log("DEBUG: Proc-" + to_string(mpidata->rank) + " found " + to_string(device_count) + " CUDA devices.\n", LOG_DEBG);
   logger->log("INFO: Process " + to_string(mpidata->rank) + " initializes MAGMA.\n");
@@ -117,8 +122,10 @@ void cluster(const string& config_file, const string& data_file, const string& o
   if (magma_result != MAGMA_SUCCESS) {
     logger->err("ERROR: Process " + to_string(mpidata->rank) + " failed to initilize MAGMA.\n");
     logger->err("PROC-" + to_string(mpidata->rank) + ": MAGMA error code " + to_string(magma_result) + "\n");
-    fclose(timing_file);
-    delete time_logger;
+    if (mpidata->rank == 0) {
+      fclose(timing_file);
+      delete time_logger;
+    }
     delete logger;
     return;
   }
@@ -188,7 +195,7 @@ void cluster(const string& config_file, const string& data_file, const string& o
       // in any case, replace all sprintf() with snprintf(), to avoid in any case writing more than the available buffer size
       char virtual_line[256];
       // Create and initialise pristine cid for MPI proc 0 and thread 0
-      ClusterIterationData *cid = new ClusterIterationData(gconf, sconf, mpidata);
+      ClusterIterationData *cid = new ClusterIterationData(gconf, sconf, mpidata, device_count);
       const int ndi = cid->c4->nsph * cid->c4->nlim;
       np_int ndit = 2 * ndi;
       logger->log("INFO: Size of matrices to invert: " + to_string((int64_t)ndit) + " x " + to_string((int64_t)ndit) +".\n");
@@ -330,11 +337,11 @@ void cluster(const string& config_file, const string& data_file, const string& o
       if (jer != 0) {
 	// First loop failed. Halt the calculation.
 	fclose(timing_file);
+	delete time_logger;
 	delete p_output;
 	delete p_scattering_angles;
 	delete cid;
 	delete logger;
-	delete time_logger;
 	delete sconf;
 	delete gconf;
 	return;
@@ -542,7 +549,7 @@ void cluster(const string& config_file, const string& data_file, const string& o
     // copy gconf, sconf, cid and p_scattering_angles from MPI process 0
     GeometryConfiguration *gconf = new GeometryConfiguration(mpidata);
     ScattererConfiguration *sconf = new ScattererConfiguration(mpidata);
-    ClusterIterationData *cid = new ClusterIterationData(mpidata);
+    ClusterIterationData *cid = new ClusterIterationData(mpidata, device_count);
     ScatteringAngles *p_scattering_angles = new ScatteringAngles(mpidata);
 
     // Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled
@@ -569,7 +576,7 @@ void cluster(const string& config_file, const string& data_file, const string& o
 	MPI_Send(&ompnumthreads, 1, MPI_INT, 0, 3, MPI_COMM_WORLD);
 	// receive myMPIstride sent by MPI process 0 to all processes
 	MPI_Bcast(&myMPIstride, 1, MPI_INT, 0, MPI_COMM_WORLD);
-	// allocate virtualfiles for each thread
+	// allocate virtual files for each thread
 	p_outarray = new VirtualAsciiFile*[ompnumthreads];
 	vtppoanarray = new VirtualBinaryFile*[ompnumthreads];
       }
@@ -592,7 +599,7 @@ void cluster(const string& config_file, const string& data_file, const string& o
       for (int ixi488=2; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) {
 	// the parallel loop over MPI processes covers a different set of indices for each thread
 #pragma omp barrier
-	int myjxi488 = ixi488+myjxi488startoffset+myompthread;
+	int myjxi488 = ixi488 + myjxi488startoffset + myompthread;
 	// each thread opens new virtual files and stores their pointers in the shared array
 	p_output_2 = new VirtualAsciiFile();
 	vtppoanp_2 = new VirtualBinaryFile();
@@ -646,8 +653,8 @@ void cluster(const string& config_file, const string& data_file, const string& o
     logger->log("INFO: Process " + to_string(mpidata->rank) + " finalizes MAGMA.\n");
     magma_finalize();
 #endif
-    fclose(timing_file);
-    delete time_logger;
+    // fclose(timing_file);
+    // delete time_logger;
     delete logger;
 #ifdef MPI_VERSION
   }
@@ -768,7 +775,7 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
 #ifdef USE_NVTX
   nvtxRangePush("Invert the matrix");
 #endif
-  invert_matrix(cid->am, ndit, jer, mxndm);
+  invert_matrix(cid->am, ndit, jer, mxndm, cid->proc_device);
 #ifdef USE_NVTX
   nvtxRangePop();
 #endif
diff --git a/src/include/Commons.h b/src/include/Commons.h
index dfa909a9ac6849bc1b5488efe2d16540fa0153d6..1a1de770a39d1e4e35cbccfc7222e073860d26f9 100644
--- a/src/include/Commons.h
+++ b/src/include/Commons.h
@@ -614,13 +614,15 @@ public:
   int xiblock;
   int firstxi;
   int lastxi;
+  //! \brief ID of the GPU used by one MPI process.
+  int proc_device;
 
-  ClusterIterationData(GeometryConfiguration *gconf, ScattererConfiguration *sconf, const mixMPI *mpidata);
+  ClusterIterationData(GeometryConfiguration *gconf, ScattererConfiguration *sconf, const mixMPI *mpidata, const int device_count);
   
   ClusterIterationData(const ClusterIterationData& rhs);
 
 #ifdef MPI_VERSION
-  ClusterIterationData(const mixMPI *mpidata);
+  ClusterIterationData(const mixMPI *mpidata, const int device_count);
 
   /*! \brief Broadcast over MPI the ClusterIterationData instance from MPI process 0 to all others.
    *
diff --git a/src/include/algebraic.h b/src/include/algebraic.h
index accdf8188139ea955e2409c4b0fe067413c49859..f052bb2d43727b8a3dc22e59d3a96bb570e2dafa 100644
--- a/src/include/algebraic.h
+++ b/src/include/algebraic.h
@@ -36,7 +36,8 @@
  * \param ier: `int &` Reference to an integer variable for returning a result flag.
  * \param max_size: `np_int` The maximum expected size (required by some call-backs,
  * optional, defaults to 0).
+ * \param target_device: `int` ID of target GPU, if available (defaults to 0).
  */
-void invert_matrix(dcomplex **mat, np_int size, int &ier, np_int max_size=0);
+void invert_matrix(dcomplex **mat, np_int size, int &ier, np_int max_size=0, int target_device=0);
 
 #endif
diff --git a/src/include/magma_calls.h b/src/include/magma_calls.h
index bf39c3a5c736b859d5311b16a2d6509f8238eafd..1002d351dac8cdbc41c875e9bfdc818b1b06f54f 100644
--- a/src/include/magma_calls.h
+++ b/src/include/magma_calls.h
@@ -31,7 +31,8 @@
  * \param mat: Matrix of complex. The matrix to be inverted.
  * \param n: `np_int` The number of rows and columns of the [n x n] matrix.
  * \param jer: `int &` Reference to an integer return flag.
+ * \param device_id: `int` ID of the device for matrix inversion offloading.
  */
-void magma_zinvert(dcomplex **mat, np_int n, int &jer);
+void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id=0);
 
 #endif
diff --git a/src/libnptm/Commons.cpp b/src/libnptm/Commons.cpp
index 6d674fe2be67df11160fc400a172c95f5e9b481a..0cecd8e615cbdc552561ac0c14cdf2cf631ad12f 100644
--- a/src/libnptm/Commons.cpp
+++ b/src/libnptm/Commons.cpp
@@ -955,7 +955,7 @@ mixMPI::mixMPI(const mixMPI& rhs) {
 mixMPI::~mixMPI() {
 }
 
-ClusterIterationData::ClusterIterationData(GeometryConfiguration *gconf, ScattererConfiguration *sconf, const mixMPI *mpidata) {
+ClusterIterationData::ClusterIterationData(GeometryConfiguration *gconf, ScattererConfiguration *sconf, const mixMPI *mpidata, const int device_count) {
   c1 = new C1(gconf, sconf);
   c2 = new C2(gconf, sconf);
   c3 = new C3();
@@ -1054,6 +1054,12 @@ ClusterIterationData::ClusterIterationData(GeometryConfiguration *gconf, Scatter
   lastxi = ((mpidata->rank+1) * xiblock)+1;
   firstxi = lastxi-xiblock+1;
   if (lastxi > sconf->number_of_scales) lastxi = sconf->number_of_scales;
+
+#ifdef USE_MAGMA
+  proc_device = mpidata->rank % device_count;
+#else
+  proc_device = 0;
+#endif
 }
 
 ClusterIterationData::ClusterIterationData(const ClusterIterationData& rhs) {
@@ -1202,10 +1208,12 @@ ClusterIterationData::ClusterIterationData(const ClusterIterationData& rhs) {
   lastxi = rhs.lastxi;
   xiblock = rhs.xiblock;
   number_of_scales = rhs.number_of_scales;
+
+  proc_device = rhs.proc_device;
 }
 
 #ifdef MPI_VERSION
-ClusterIterationData::ClusterIterationData(const mixMPI *mpidata) {
+ClusterIterationData::ClusterIterationData(const mixMPI *mpidata, const int device_count) {
   c1 = new C1(mpidata);
   c2 = new C2(mpidata);
   c3 = new C3(mpidata);
@@ -1336,6 +1344,12 @@ ClusterIterationData::ClusterIterationData(const mixMPI *mpidata) {
   lastxi = ((mpidata->rank+1) * xiblock)+1;
   firstxi = lastxi-xiblock+1;
   if (lastxi > number_of_scales) lastxi = number_of_scales;
+
+#ifdef USE_MAGMA
+  proc_device = mpidata->rank % device_count;
+#else
+  proc_device = 0;
+#endif
 }
 
 void ClusterIterationData::mpibcast(const mixMPI *mpidata) {
diff --git a/src/libnptm/algebraic.cpp b/src/libnptm/algebraic.cpp
index c942a1fdd3a4f242fd409a6390975da344d93cee..c25ea0aa92dd1d7f9f9d27b13f6be50e82f7506d 100644
--- a/src/libnptm/algebraic.cpp
+++ b/src/libnptm/algebraic.cpp
@@ -44,10 +44,10 @@ extern void lucin(dcomplex **mat, np_int max_size, np_int size, int &ier);
 
 using namespace std;
 
-void invert_matrix(dcomplex **mat, np_int size, int &ier, np_int max_size) {
+void invert_matrix(dcomplex **mat, np_int size, int &ier, np_int max_size, int target_device) {
   ier = 0;
 #ifdef USE_MAGMA
-  magma_zinvert(mat, size, ier);
+  magma_zinvert(mat, size, ier, target_device);
 #elif defined USE_LAPACK
   zinvert(mat, size, ier);
 #else
diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp
index 1712036ade1d3cb770ce91d1f03d29b0e5565925..42d136b28844f9ddb2a13f1909e355d1e742918e 100644
--- a/src/libnptm/clu_subs.cpp
+++ b/src/libnptm/clu_subs.cpp
@@ -2015,7 +2015,9 @@ void raba(
 #ifdef USE_NVTX
   nvtxRangePush("raba outer loop 2");
 #endif
+#ifdef USE_TARGET_OFFLOAD
 #pragma omp teams distribute parallel for
+#endif
   for (int ipo = 0; ipo < 2; ipo++) {
     int jpo = 1 - ipo;
     ctqce[ipo][0] = cc0;
diff --git a/src/libnptm/magma_calls.cpp b/src/libnptm/magma_calls.cpp
index d9875d9bae54e25b974e337a2f11c3152fd410c4..4cbae602b473d145325c9c529f91fd1e550b1453 100644
--- a/src/libnptm/magma_calls.cpp
+++ b/src/libnptm/magma_calls.cpp
@@ -27,19 +27,19 @@
 #include "../include/magma_calls.h"
 #endif
 
-void magma_zinvert(dcomplex **mat, np_int n, int &jer) {
+void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id) {
   // magma_int_t result = magma_init();
   magma_int_t err = MAGMA_SUCCESS;
   magma_queue_t queue = NULL;
-  magma_device_t dev = 0;
+  magma_device_t dev = (magma_device_t)device_id;
   magma_queue_create(dev, &queue);
-  magmaDoubleComplex *dwork; // dwork - workspace
+  magmaDoubleComplex *dwork; // workspace
   magma_int_t ldwork; // size of dwork
-  magma_int_t *piv , info; // piv - array of indices of inter -
+  magma_int_t *piv , info; // array of pivot indices
   magma_int_t m = (magma_int_t)n; // changed rows; a - mxm matrix
-  magma_int_t mm = m * m; // size of a, r, c
-  magmaDoubleComplex *a = (magmaDoubleComplex *)&(mat[0][0]); // a - mxm matrix on the host
-  magmaDoubleComplex *d_a; // d_a - mxm matrix a on the device
+  magma_int_t mm = m * m; // size of a
+  magmaDoubleComplex *a = (magmaDoubleComplex *)&(mat[0][0]); // pointer to first element on host
+  magmaDoubleComplex *d_a; // pointer to first element on device
   ldwork = m * magma_get_zgetri_nb(m); // optimal block size
   // allocate matrices
   err = magma_zmalloc(&d_a, mm); // device memory for a
@@ -50,7 +50,7 @@ void magma_zinvert(dcomplex **mat, np_int n, int &jer) {
   magma_zgetrf_gpu(m, m, d_a, m, piv, &info);
   magma_zgetri_gpu(m, d_a, m, piv, dwork, ldwork, &info);
   
-  magma_zgetmatrix( m, m, d_a , m, a, m, queue); // copy d_a -> a
+  magma_zgetmatrix(m, m, d_a , m, a, m, queue); // copy d_a -> a
   delete[] piv; // free host memory
   magma_free(d_a); // free device memory
   magma_queue_destroy(queue); // destroy queue
diff --git a/src/sphere/sphere.cpp b/src/sphere/sphere.cpp
index 020a176c873b610809c376f868212cd18dd1b1c4..c0e789ff35ed729e2323e551d4fdf01b1d091de1 100644
--- a/src/sphere/sphere.cpp
+++ b/src/sphere/sphere.cpp
@@ -312,11 +312,12 @@ void sphere(const string& config_file, const string& data_file, const string& ou
 	    }
 	    continue; // i132
 	  }
+	  // label 125
 	  int nsh = c1->nshl[i132];
 	  int ici = (nsh + 1) / 2;
 	  if (idfc == 0) {
 	    for (int ic = 0; ic < ici; ic++)
-	      c2->dc0[ic] = sconf->get_dielectric_constant(ic, i132, 0); // WARNING: IDFC=0 is not tested!
+	      c2->dc0[ic] = sconf->get_dielectric_constant(ic, i132, jxi488); // WARNING: IDFC=0 is not tested!
 	  } else { // IDFC != 0
 	    if (jxi == 1) {
 	      for (int ic = 0; ic < ici; ic++) {