diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000000000000000000000000000000000..25a912b4ace5a9d2a90c69d4accbb3f469a82231 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,40 @@ +authors: + - family-names: La Mura + given-names: Giovanni + - family-names: Mulas + given-names: Giacomo + - family-names: Saija + given-names: Rosalba +cff-version: 1.2.0 +license: GNU GPLv3 +message: "If you use this software, please cite the references below." +repository-code: "https://www.ict.inaf.it/gitlab/giacomo.mulas/np_tmcode" +title: "Nano-particle Transition Matrix code" +version: 8.3 +references: + - authors: + - family-names: Saija + given-names: Rosalba + - family-names: Iatì + given-names: Maria Antonia + - family-names: Borghese + given-names: Ferdinando + doi: 10.1086/322350 + issue: 1 + journal: "The Astrophysical Journal" + title: "Beyond Mie Theory: The Transition Matrix Approach in Interstellar Dust Modeling" + type: article + volume: 559 + start: 993 + year: 2001 + - authors: + - family-names: Borghese + given-names: Ferdinando + - family-names: Denti + given-names: Paolo + - family-names: Saija + given-names: Rosalba + doi: 10.1007/978-3-540-37414-5 + title: "Scattering from Model Nonspherical Particles" + type: book + year: 2007 diff --git a/README.md b/README.md index 472443f23ae2e1a84bd35446739bfb446f401255..88a2a929a0d03e0c122d92e7f7b8dbdcc4ce8f06 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,10 @@ Distributing the code and its sources is possible under the terms of the GNU GPL *NOTE:* The building process requires a working installation of a C++ and a FORTRAN compiler. Many solutions are available, but the recommended option is the *GNU Compiler Collection* `gcc` with the addition of `g++` and `gfortran`. The parallel code implementation further requires the use of parallel compilers complying with the MPI standard (*OpenMPI*, *MPICH*). +# Acknowledgments + +Supported by the Italian Research Center on High Performance Computing, Big Data and Quantum Computing (ICSC), a project funded by the European Union - NextGenerationEU - and the National Recovery and Resilience Plan (NRRP), Mission 4 Component 2, within the activities of Spoke 3 (Astrophysics and Cosmos Observations). This work was completed in part at the CINECA GPU HACKATHON 2024, part of the Open Hackathons program. The authors would like to acknowledge OpenACC-Standard.org for their support. 
+ # License Copyright (C) 2024 INAF - Osservatorio Astronomico di Cagliari diff --git a/build/Makefile.in b/build/Makefile.in index 21198afa6b69ced519eecafa68e1fc1dd6beaf08..a82f103bc3a50f7d90d196ef11114d485dd3982b 100644 --- a/build/Makefile.in +++ b/build/Makefile.in @@ -470,9 +470,7 @@ FFLAGS = @FFLAGS@ FGREP = @FGREP@ FILECMD = @FILECMD@ GREP = @GREP@ -HDF5_INCLUDE = @HDF5_INCLUDE@ HDF5_LDFLAGS = @HDF5_LDFLAGS@ -HDF5_LIB = @HDF5_LIB@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ diff --git a/build/configure b/build/configure index 44ef1e6879e8804001c91c1fc199c2ab88307817..67e4ee42698a4e46ecd84deb4a13e4c3c14b60c9 100755 --- a/build/configure +++ b/build/configure @@ -669,8 +669,6 @@ OFFLOADFLAGS BUILDFORTRAN_FALSE BUILDFORTRAN_TRUE HDF5_LDFLAGS -HDF5_LIB -HDF5_INCLUDE CXXCPP LT_SYS_LIBRARY_PATH OTOOL64 @@ -9460,8 +9458,6 @@ EOF if test "x$CXX_IS_MPI" = "x0"; then MPIFLAGS=-DUSE_MPI - MPIFLAGS="" - fi fi @@ -24832,39 +24828,52 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu # Environment setup -if test "x$HDF5_INCLUDE" = "x" -then : - HDF5_INCLUDE="/usr/include/hdf5/serial" -else case e in #( - e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: HDF5_INCLUDE=$(HDF5_INCLUDE)" >&5 -printf "%s\n" "$as_me: HDF5_INCLUDE=$(HDF5_INCLUDE)" >&6;} - ;; -esac -fi -if test "x$HDF5_LIB" = "x" -then : - HDF5_LIB="/usr/lib/x86_64-linux-gnu/hdf5/serial" + export -p | grep HDF5_ROOT > /dev/null + result=$? + if test "x$result" = "x0"; then + if test "x$HDF5_INCLUDE" = "x"; then + export HDF5_INCLUDE=${HDF5_ROOT}/include + fi + if test "x$HDF5_LIB" = "x"; then + export HDF5_LIB=${HDF5_ROOT}/lib + fi + fi + export -p | grep HDF5_DIR > /dev/null + result=$? + if test "x$result" = "x0"; then + if test "x$HDF5_INCLUDE" = "x"; then + export HDF5_INCLUDE=${HDF5_DIR}/include + fi + if test "x$HDF5_LIB" = "x"; then + export HDF5_LIB=${HDF5_DIR}/lib + fi + fi + if test "x$HDF5_INCLUDE" = "x"; then + export HDF5_INCLUDE="/usr/include/hdf5/serial" + fi + if test "x$HDF5_LIB" = "x"; then + export HDF5_LIB="/usr/lib/x86_64-linux-gnu/hdf5/serial" + fi + -else case e in #( - e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: HDF5_LIB=$(HDF5_LIB)" >&5 -printf "%s\n" "$as_me: HDF5_LIB=$(HDF5_LIB)" >&6;} - ;; -esac -fi # Check for required headers -as_ac_Header=`printf "%s\n" "ac_cv_header_$HDF5_INCLUDE/hdf5.h" | sed "$as_sed_sh"` -ac_fn_c_check_header_compile "$LINENO" "$HDF5_INCLUDE/hdf5.h" "$as_ac_Header" "$ac_includes_default" -if eval test \"x\$"$as_ac_Header"\" = x"yes" +# AC_CHECK_HEADER( +# [${HDF5_INCLUDE}/hdf5.h], +# , +# AC_MSG_ERROR([Could not find HDF5 headers!]), +# ) +if test -f ${HDF5_INCLUDE}/hdf5.h then : - + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: HDF5 headers found in ${HDF5_INCLUDE}" >&5 +printf "%s\n" "$as_me: HDF5 headers found in ${HDF5_INCLUDE}" >&6;} else case e in #( - e) as_fn_error $? "\"Could not find HDF5 headers!" "$LINENO" 5 ;; + e) as_fn_error $? "HDF5 headers not found!" "$LINENO" 5 + ;; esac fi - # Check for required libraries cat > nptm_test_hdf5.cpp <<EOF @@ -25105,7 +25114,7 @@ then : else - export -p | grep MKL + export -p | grep MKL > /dev/null MKL_DEF=$? if test "x$MKL_DEF" = "x0"; then export LAPACKFLAGS="-DUSE_LAPACK -DUSE_MKL -DLAPACK_ILP64 -DUSE_ILP64 -I{MKLROOT}/include" @@ -25137,7 +25146,7 @@ fi else case e in #( e) - export -p | grep MKL + export -p | grep MKL > /dev/null MKL_DEF=$? 
if test "x$MKL_DEF" = "x0"; then export LAPACKFLAGS="-DUSE_LAPACK -DUSE_MKL -DLAPACK_ILP64 -DUSE_ILP64 -I{MKLROOT}/include" @@ -25191,7 +25200,7 @@ then : elif test -f /usr/include/cuda.h; then CUDAFLAGS="-I/usr/include" CUDALDFLAGS="-lcudart" - elif text "x$CUDA_HOME" != "x"; then + elif test "x$CUDA_HOME" != "x"; then CUDAFLAGS="-I${CUDA_HOME}/include" CUDALDFLAGS="-L${CUDA_HOME}/lib64 -lcudart" fi @@ -25205,6 +25214,9 @@ then : elif test "x$MAGMA_HOME" != "x"; then export MAGMAFLAGS="-DUSE_MAGMA -DMAGMA_ILP64 $CUDAFLAGS -I${MAGMA_HOME}/include" export MAGMALDFLAGS="$CUDALDFLAGS -L${MAGMA_HOME}/lib -lmagma" + elif test "x$MAGMA_ROOT" != "x"; then + export MAGMAFLAGS="-DUSE_MAGMA -DMAGMA_ILP64 $CUDAFLAGS -I${MAGMA_ROOT}/include" + export MAGMALDFLAGS="$CUDALDFLAGS -L${MAGMA_ROOT}/lib -lmagma" fi fi @@ -25234,7 +25246,7 @@ else case e in #( elif test -f /usr/include/cuda.h; then CUDAFLAGS="-I/usr/include" CUDALDFLAGS="-lcudart" - elif text "x$CUDA_HOME" != "x"; then + elif test "x$CUDA_HOME" != "x"; then CUDAFLAGS="-I${CUDA_HOME}/include" CUDALDFLAGS="-L${CUDA_HOME}/lib64 -lcudart" fi @@ -25248,6 +25260,9 @@ else case e in #( elif test "x$MAGMA_HOME" != "x"; then export MAGMAFLAGS="-DUSE_MAGMA -DMAGMA_ILP64 $CUDAFLAGS -I${MAGMA_HOME}/include" export MAGMALDFLAGS="$CUDALDFLAGS -L${MAGMA_HOME}/lib -lmagma" + elif test "x$MAGMA_ROOT" != "x"; then + export MAGMAFLAGS="-DUSE_MAGMA -DMAGMA_ILP64 $CUDAFLAGS -I${MAGMA_ROOT}/include" + export MAGMALDFLAGS="$CUDALDFLAGS -L${MAGMA_ROOT}/lib -lmagma" fi fi diff --git a/build/configure.ac b/build/configure.ac index 0238f91edac77c5f9a6a61c1f3db6b3a86ec5fac..7ea86e2fd8c6e01e2ddea97608620aafbc0cac7b 100644 --- a/build/configure.ac +++ b/build/configure.ac @@ -19,10 +19,42 @@ EOF ] ) +m4_define( + [M4_DETECT_HDF5], + [ + export -p | grep HDF5_ROOT > /dev/null + result=$? + if test "x$result" = "x0"; then + if test "x$HDF5_INCLUDE" = "x"; then + export HDF5_INCLUDE=${HDF5_ROOT}/include + fi + if test "x$HDF5_LIB" = "x"; then + export HDF5_LIB=${HDF5_ROOT}/lib + fi + fi + export -p | grep HDF5_DIR > /dev/null + result=$? + if test "x$result" = "x0"; then + if test "x$HDF5_INCLUDE" = "x"; then + export HDF5_INCLUDE=${HDF5_DIR}/include + fi + if test "x$HDF5_LIB" = "x"; then + export HDF5_LIB=${HDF5_DIR}/lib + fi + fi + if test "x$HDF5_INCLUDE" = "x"; then + export HDF5_INCLUDE="/usr/include/hdf5/serial" + fi + if test "x$HDF5_LIB" = "x"; then + export HDF5_LIB="/usr/lib/x86_64-linux-gnu/hdf5/serial" + fi + ] +) + m4_define( [M4_DETECT_LAPACK], [ - export -p | grep MKL + export -p | grep MKL > /dev/null MKL_DEF=$? 
if test "x$MKL_DEF" = "x0"; then export LAPACKFLAGS="-DUSE_LAPACK -DUSE_MKL -DLAPACK_ILP64 -DUSE_ILP64 -I{MKLROOT}/include" @@ -45,7 +77,7 @@ m4_define( elif test -f /usr/include/cuda.h; then CUDAFLAGS="-I/usr/include" CUDALDFLAGS="-lcudart" - elif text "x$CUDA_HOME" != "x"; then + elif test "x$CUDA_HOME" != "x"; then CUDAFLAGS="-I${CUDA_HOME}/include" CUDALDFLAGS="-L${CUDA_HOME}/lib64 -lcudart" fi @@ -59,6 +91,9 @@ m4_define( elif test "x$MAGMA_HOME" != "x"; then export MAGMAFLAGS="-DUSE_MAGMA -DMAGMA_ILP64 $CUDAFLAGS -I${MAGMA_HOME}/include" export MAGMALDFLAGS="$CUDALDFLAGS -L${MAGMA_HOME}/lib -lmagma" + elif test "x$MAGMA_ROOT" != "x"; then + export MAGMAFLAGS="-DUSE_MAGMA -DMAGMA_ILP64 $CUDAFLAGS -I${MAGMA_ROOT}/include" + export MAGMALDFLAGS="$CUDALDFLAGS -L${MAGMA_ROOT}/lib -lmagma" fi fi ] @@ -198,7 +233,6 @@ AC_ARG_ENABLE( M4_TEST_MPI if test "x$CXX_IS_MPI" = "x0"; then AC_SUBST([MPIFLAGS], [-DUSE_MPI]) - AC_SUBST([MPIFLAGS], [""]) fi fi ] @@ -217,22 +251,18 @@ AS_IF( LT_INIT # Environment setup -AS_IF( - [test "x$HDF5_INCLUDE" = "x"], - [AC_SUBST([HDF5_INCLUDE], ["/usr/include/hdf5/serial"])], - [AC_MSG_NOTICE([HDF5_INCLUDE=$(HDF5_INCLUDE)])] -) -AS_IF( - [test "x$HDF5_LIB" = "x"], - [AC_SUBST([HDF5_LIB], ["/usr/lib/x86_64-linux-gnu/hdf5/serial"])], - [AC_MSG_NOTICE([HDF5_LIB=$(HDF5_LIB)])] -) +M4_DETECT_HDF5 # Check for required headers -AC_CHECK_HEADER( - [$HDF5_INCLUDE/hdf5.h], - , - AC_MSG_ERROR(["Could not find HDF5 headers!]), +# AC_CHECK_HEADER( +# [${HDF5_INCLUDE}/hdf5.h], +# , +# AC_MSG_ERROR([Could not find HDF5 headers!]), +# ) +AS_IF( + [test -f ${HDF5_INCLUDE}/hdf5.h], + [AC_MSG_NOTICE([HDF5 headers found in ${HDF5_INCLUDE}])], + [AC_MSG_ERROR([HDF5 headers not found!])] ) # Check for required libraries diff --git a/src/cluster/cluster.cpp b/src/cluster/cluster.cpp index 73c24667a7ce36a478f20795aab0a93849d514b9..6a2f841fba6b496db255caa32bbc7f47a3a7457b 100644 --- a/src/cluster/cluster.cpp +++ b/src/cluster/cluster.cpp @@ -100,16 +100,21 @@ void cluster(const string& config_file, const string& data_file, const string& o chrono::time_point<chrono::high_resolution_clock> t_start = chrono::high_resolution_clock::now(); chrono::duration<double> elapsed; string message; - string timing_name = output_path + "/c_timing_mpi"+ to_string(mpidata->rank) +".log"; - FILE *timing_file = fopen(timing_name.c_str(), "w"); - Logger *time_logger = new Logger(LOG_DEBG, timing_file); + string timing_name; + FILE *timing_file; + Logger *time_logger; + if (mpidata->rank == 0) { + timing_name = output_path + "/c_timing_mpi"+ to_string(mpidata->rank) +".log"; + timing_file = fopen(timing_name.c_str(), "w"); + time_logger = new Logger(LOG_DEBG, timing_file); + } Logger *logger = new Logger(LOG_DEBG); + int device_count = 0; //=========== // Initialise MAGMA //=========== #ifdef USE_MAGMA - int device_count; cudaGetDeviceCount(&device_count); logger->log("DEBUG: Proc-" + to_string(mpidata->rank) + " found " + to_string(device_count) + " CUDA devices.\n", LOG_DEBG); logger->log("INFO: Process " + to_string(mpidata->rank) + " initializes MAGMA.\n"); @@ -117,8 +122,10 @@ void cluster(const string& config_file, const string& data_file, const string& o if (magma_result != MAGMA_SUCCESS) { logger->err("ERROR: Process " + to_string(mpidata->rank) + " failed to initilize MAGMA.\n"); logger->err("PROC-" + to_string(mpidata->rank) + ": MAGMA error code " + to_string(magma_result) + "\n"); - fclose(timing_file); - delete time_logger; + if (mpidata->rank == 0) { + fclose(timing_file); + delete time_logger; 
+ } delete logger; return; } @@ -188,7 +195,7 @@ void cluster(const string& config_file, const string& data_file, const string& o // in any case, replace all sprintf() with snprintf(), to avoid in any case writing more than the available buffer size char virtual_line[256]; // Create and initialise pristine cid for MPI proc 0 and thread 0 - ClusterIterationData *cid = new ClusterIterationData(gconf, sconf, mpidata); + ClusterIterationData *cid = new ClusterIterationData(gconf, sconf, mpidata, device_count); const int ndi = cid->c4->nsph * cid->c4->nlim; np_int ndit = 2 * ndi; logger->log("INFO: Size of matrices to invert: " + to_string((int64_t)ndit) + " x " + to_string((int64_t)ndit) +".\n"); @@ -330,11 +337,11 @@ void cluster(const string& config_file, const string& data_file, const string& o if (jer != 0) { // First loop failed. Halt the calculation. fclose(timing_file); + delete time_logger; delete p_output; delete p_scattering_angles; delete cid; delete logger; - delete time_logger; delete sconf; delete gconf; return; @@ -542,7 +549,7 @@ void cluster(const string& config_file, const string& data_file, const string& o // copy gconf, sconf, cid and p_scattering_angles from MPI process 0 GeometryConfiguration *gconf = new GeometryConfiguration(mpidata); ScattererConfiguration *sconf = new ScattererConfiguration(mpidata); - ClusterIterationData *cid = new ClusterIterationData(mpidata); + ClusterIterationData *cid = new ClusterIterationData(mpidata, device_count); ScatteringAngles *p_scattering_angles = new ScatteringAngles(mpidata); // Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled @@ -569,7 +576,7 @@ void cluster(const string& config_file, const string& data_file, const string& o MPI_Send(&ompnumthreads, 1, MPI_INT, 0, 3, MPI_COMM_WORLD); // receive myMPIstride sent by MPI process 0 to all processes MPI_Bcast(&myMPIstride, 1, MPI_INT, 0, MPI_COMM_WORLD); - // allocate virtualfiles for each thread + // allocate virtual files for each thread p_outarray = new VirtualAsciiFile*[ompnumthreads]; vtppoanarray = new VirtualBinaryFile*[ompnumthreads]; } @@ -592,7 +599,7 @@ void cluster(const string& config_file, const string& data_file, const string& o for (int ixi488=2; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) { // the parallel loop over MPI processes covers a different set of indices for each thread #pragma omp barrier - int myjxi488 = ixi488+myjxi488startoffset+myompthread; + int myjxi488 = ixi488 + myjxi488startoffset + myompthread; // each thread opens new virtual files and stores their pointers in the shared array p_output_2 = new VirtualAsciiFile(); vtppoanp_2 = new VirtualBinaryFile(); @@ -646,8 +653,8 @@ void cluster(const string& config_file, const string& data_file, const string& o logger->log("INFO: Process " + to_string(mpidata->rank) + " finalizes MAGMA.\n"); magma_finalize(); #endif - fclose(timing_file); - delete time_logger; + // fclose(timing_file); + // delete time_logger; delete logger; #ifdef MPI_VERSION } @@ -768,7 +775,7 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf #ifdef USE_NVTX nvtxRangePush("Invert the matrix"); #endif - invert_matrix(cid->am, ndit, jer, mxndm); + invert_matrix(cid->am, ndit, jer, mxndm, cid->proc_device); #ifdef USE_NVTX nvtxRangePop(); #endif diff --git a/src/include/Commons.h b/src/include/Commons.h index dfa909a9ac6849bc1b5488efe2d16540fa0153d6..1a1de770a39d1e4e35cbccfc7222e073860d26f9 100644 --- 
a/src/include/Commons.h +++ b/src/include/Commons.h @@ -614,13 +614,15 @@ public: int xiblock; int firstxi; int lastxi; + //! \brief ID of the GPU used by one MPI process. + int proc_device; - ClusterIterationData(GeometryConfiguration *gconf, ScattererConfiguration *sconf, const mixMPI *mpidata); + ClusterIterationData(GeometryConfiguration *gconf, ScattererConfiguration *sconf, const mixMPI *mpidata, const int device_count); ClusterIterationData(const ClusterIterationData& rhs); #ifdef MPI_VERSION - ClusterIterationData(const mixMPI *mpidata); + ClusterIterationData(const mixMPI *mpidata, const int device_count); /*! \brief Broadcast over MPI the ClusterIterationData instance from MPI process 0 to all others. * diff --git a/src/include/algebraic.h b/src/include/algebraic.h index accdf8188139ea955e2409c4b0fe067413c49859..f052bb2d43727b8a3dc22e59d3a96bb570e2dafa 100644 --- a/src/include/algebraic.h +++ b/src/include/algebraic.h @@ -36,7 +36,8 @@ * \param ier: `int &` Reference to an integer variable for returning a result flag. * \param max_size: `np_int` The maximum expected size (required by some call-backs, * optional, defaults to 0). + * \param target_device: `int` ID of target GPU, if available (defaults to 0). */ -void invert_matrix(dcomplex **mat, np_int size, int &ier, np_int max_size=0); +void invert_matrix(dcomplex **mat, np_int size, int &ier, np_int max_size=0, int target_device=0); #endif diff --git a/src/include/magma_calls.h b/src/include/magma_calls.h index bf39c3a5c736b859d5311b16a2d6509f8238eafd..1002d351dac8cdbc41c875e9bfdc818b1b06f54f 100644 --- a/src/include/magma_calls.h +++ b/src/include/magma_calls.h @@ -31,7 +31,8 @@ * \param mat: Matrix of complex. The matrix to be inverted. * \param n: `np_int` The number of rows and columns of the [n x n] matrix. * \param jer: `int &` Reference to an integer return flag. + * \param device_id: `int` ID of the device for matrix inversion offloading. 
*/ -void magma_zinvert(dcomplex **mat, np_int n, int &jer); +void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id=0); #endif diff --git a/src/libnptm/Commons.cpp b/src/libnptm/Commons.cpp index 6d674fe2be67df11160fc400a172c95f5e9b481a..0cecd8e615cbdc552561ac0c14cdf2cf631ad12f 100644 --- a/src/libnptm/Commons.cpp +++ b/src/libnptm/Commons.cpp @@ -955,7 +955,7 @@ mixMPI::mixMPI(const mixMPI& rhs) { mixMPI::~mixMPI() { } -ClusterIterationData::ClusterIterationData(GeometryConfiguration *gconf, ScattererConfiguration *sconf, const mixMPI *mpidata) { +ClusterIterationData::ClusterIterationData(GeometryConfiguration *gconf, ScattererConfiguration *sconf, const mixMPI *mpidata, const int device_count) { c1 = new C1(gconf, sconf); c2 = new C2(gconf, sconf); c3 = new C3(); @@ -1054,6 +1054,12 @@ ClusterIterationData::ClusterIterationData(GeometryConfiguration *gconf, Scatter lastxi = ((mpidata->rank+1) * xiblock)+1; firstxi = lastxi-xiblock+1; if (lastxi > sconf->number_of_scales) lastxi = sconf->number_of_scales; + +#ifdef USE_MAGMA + proc_device = mpidata->rank % device_count; +#else + proc_device = 0; +#endif } ClusterIterationData::ClusterIterationData(const ClusterIterationData& rhs) { @@ -1202,10 +1208,12 @@ ClusterIterationData::ClusterIterationData(const ClusterIterationData& rhs) { lastxi = rhs.lastxi; xiblock = rhs.xiblock; number_of_scales = rhs.number_of_scales; + + proc_device = rhs.proc_device; } #ifdef MPI_VERSION -ClusterIterationData::ClusterIterationData(const mixMPI *mpidata) { +ClusterIterationData::ClusterIterationData(const mixMPI *mpidata, const int device_count) { c1 = new C1(mpidata); c2 = new C2(mpidata); c3 = new C3(mpidata); @@ -1336,6 +1344,12 @@ ClusterIterationData::ClusterIterationData(const mixMPI *mpidata) { lastxi = ((mpidata->rank+1) * xiblock)+1; firstxi = lastxi-xiblock+1; if (lastxi > number_of_scales) lastxi = number_of_scales; + +#ifdef USE_MAGMA + proc_device = mpidata->rank % device_count; +#else + proc_device = 0; +#endif } void ClusterIterationData::mpibcast(const mixMPI *mpidata) { diff --git a/src/libnptm/algebraic.cpp b/src/libnptm/algebraic.cpp index c942a1fdd3a4f242fd409a6390975da344d93cee..c25ea0aa92dd1d7f9f9d27b13f6be50e82f7506d 100644 --- a/src/libnptm/algebraic.cpp +++ b/src/libnptm/algebraic.cpp @@ -44,10 +44,10 @@ extern void lucin(dcomplex **mat, np_int max_size, np_int size, int &ier); using namespace std; -void invert_matrix(dcomplex **mat, np_int size, int &ier, np_int max_size) { +void invert_matrix(dcomplex **mat, np_int size, int &ier, np_int max_size, int target_device) { ier = 0; #ifdef USE_MAGMA - magma_zinvert(mat, size, ier); + magma_zinvert(mat, size, ier, target_device); #elif defined USE_LAPACK zinvert(mat, size, ier); #else diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp index 1712036ade1d3cb770ce91d1f03d29b0e5565925..42d136b28844f9ddb2a13f1909e355d1e742918e 100644 --- a/src/libnptm/clu_subs.cpp +++ b/src/libnptm/clu_subs.cpp @@ -2015,7 +2015,9 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba outer loop 2"); #endif +#ifdef USE_TARGET_OFFLOAD #pragma omp teams distribute parallel for +#endif for (int ipo = 0; ipo < 2; ipo++) { int jpo = 1 - ipo; ctqce[ipo][0] = cc0; diff --git a/src/libnptm/magma_calls.cpp b/src/libnptm/magma_calls.cpp index d9875d9bae54e25b974e337a2f11c3152fd410c4..4cbae602b473d145325c9c529f91fd1e550b1453 100644 --- a/src/libnptm/magma_calls.cpp +++ b/src/libnptm/magma_calls.cpp @@ -27,19 +27,19 @@ #include "../include/magma_calls.h" #endif -void magma_zinvert(dcomplex **mat, 
np_int n, int &jer) { +void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id) { // magma_int_t result = magma_init(); magma_int_t err = MAGMA_SUCCESS; magma_queue_t queue = NULL; - magma_device_t dev = 0; + magma_device_t dev = (magma_device_t)device_id; magma_queue_create(dev, &queue); - magmaDoubleComplex *dwork; // dwork - workspace + magmaDoubleComplex *dwork; // workspace magma_int_t ldwork; // size of dwork - magma_int_t *piv , info; // piv - array of indices of inter - + magma_int_t *piv , info; // array of pivot indices magma_int_t m = (magma_int_t)n; // changed rows; a - mxm matrix - magma_int_t mm = m * m; // size of a, r, c - magmaDoubleComplex *a = (magmaDoubleComplex *)&(mat[0][0]); // a - mxm matrix on the host - magmaDoubleComplex *d_a; // d_a - mxm matrix a on the device + magma_int_t mm = m * m; // size of a + magmaDoubleComplex *a = (magmaDoubleComplex *)&(mat[0][0]); // pointer to first element on host + magmaDoubleComplex *d_a; // pointer to first element on device ldwork = m * magma_get_zgetri_nb(m); // optimal block size // allocate matrices err = magma_zmalloc(&d_a, mm); // device memory for a @@ -50,7 +50,7 @@ void magma_zinvert(dcomplex **mat, np_int n, int &jer) { magma_zgetrf_gpu(m, m, d_a, m, piv, &info); magma_zgetri_gpu(m, d_a, m, piv, dwork, ldwork, &info); - magma_zgetmatrix( m, m, d_a , m, a, m, queue); // copy d_a -> a + magma_zgetmatrix(m, m, d_a , m, a, m, queue); // copy d_a -> a delete[] piv; // free host memory magma_free(d_a); // free device memory magma_queue_destroy(queue); // destroy queue diff --git a/src/sphere/sphere.cpp b/src/sphere/sphere.cpp index 020a176c873b610809c376f868212cd18dd1b1c4..c0e789ff35ed729e2323e551d4fdf01b1d091de1 100644 --- a/src/sphere/sphere.cpp +++ b/src/sphere/sphere.cpp @@ -312,11 +312,12 @@ void sphere(const string& config_file, const string& data_file, const string& ou } continue; // i132 } + // label 125 int nsh = c1->nshl[i132]; int ici = (nsh + 1) / 2; if (idfc == 0) { for (int ic = 0; ic < ici; ic++) - c2->dc0[ic] = sconf->get_dielectric_constant(ic, i132, 0); // WARNING: IDFC=0 is not tested! + c2->dc0[ic] = sconf->get_dielectric_constant(ic, i132, jxi488); // WARNING: IDFC=0 is not tested! } else { // IDFC != 0 if (jxi == 1) { for (int ic = 0; ic < ici; ic++) {