diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 24a87265b60e3188af7c525164f1e0e87d9f0d66..475daf9ac3eaf133a384834d7b0e7aafd54d67c2 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -154,7 +154,7 @@ building_stage: - cat /etc/os-release - cd build - echo "Configuring with default compilers (MAGMA disabled)..." - - ./configure --without-magma --disable-offload --enable-refinement + - ./configure --without-magma --without-cublas --disable-offload --enable-refinement - make clean - echo "Building the default configuration..." - make -j diff --git a/build/Makefile.am b/build/Makefile.am index cbfcd574ce12fce5a71b66fdd9f46ca7a5636f2d..d9b8fa1e997303ba95954eb1f7187c214edbd2a3 100644 --- a/build/Makefile.am +++ b/build/Makefile.am @@ -1,4 +1,4 @@ -LDADD=libnptm/libnptm.la -L/usr/lib64 ${USER_LDFLAGS} ${HDF5_LDFLAGS} ${LAPACKLDFLAGS} ${BLASLDFLAGS} ${MAGMALDFLAGS} +LDADD=libnptm/libnptm.la -L/usr/lib64 ${USER_LDFLAGS} ${HDF5_LDFLAGS} ${LAPACKLDFLAGS} ${BLASLDFLAGS} ${CUBLASLDFLAGS} ${MAGMALDFLAGS} lib_LTLIBRARIES=libnptm/libnptm.la libnptm_libnptm_la_SOURCES=../src/libnptm/algebraic.cpp ../src/libnptm/clu_subs.cpp ../src/libnptm/Commons.cpp ../src/libnptm/Configuration.cpp ../src/libnptm/file_io.cpp ../src/libnptm/inclu_subs.cpp ../src/libnptm/lapack_calls.cpp ../src/libnptm/logging.cpp ../src/libnptm/magma_calls.cpp ../src/libnptm/cublas_calls.cpp ../src/libnptm/Parsers.cpp ../src/libnptm/sph_subs.cpp ../src/libnptm/utils.cpp ../src/libnptm/tfrfme.cpp ../src/libnptm/TransitionMatrix.cpp ../src/libnptm/tra_subs.cpp if BUILDFORTRAN diff --git a/build/Makefile.in b/build/Makefile.in index bdddb965366c8a04c4b81e0a1b71f99a84029dfb..077a305569f613095c2b0f2ad174191c4adf6aa0 100644 --- a/build/Makefile.in +++ b/build/Makefile.in @@ -162,9 +162,10 @@ am_libnptm_libnptm_la_OBJECTS = ../src/libnptm/algebraic.lo \ ../src/libnptm/Configuration.lo ../src/libnptm/file_io.lo \ ../src/libnptm/inclu_subs.lo ../src/libnptm/lapack_calls.lo \ ../src/libnptm/logging.lo 
../src/libnptm/magma_calls.lo \ - ../src/libnptm/Parsers.lo ../src/libnptm/sph_subs.lo \ - ../src/libnptm/utils.lo ../src/libnptm/tfrfme.lo \ - ../src/libnptm/TransitionMatrix.lo ../src/libnptm/tra_subs.lo + ../src/libnptm/cublas_calls.lo ../src/libnptm/Parsers.lo \ + ../src/libnptm/sph_subs.lo ../src/libnptm/utils.lo \ + ../src/libnptm/tfrfme.lo ../src/libnptm/TransitionMatrix.lo \ + ../src/libnptm/tra_subs.lo libnptm_libnptm_la_OBJECTS = $(am_libnptm_libnptm_la_OBJECTS) AM_V_lt = $(am__v_lt_@AM_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) @@ -178,7 +179,8 @@ cluster_clu_LDADD = $(LDADD) am__DEPENDENCIES_1 = cluster_clu_DEPENDENCIES = libnptm/libnptm.la $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) + $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ + $(am__DEPENDENCIES_1) am__cluster_edfb_clu_SOURCES_DIST = ../src/cluster/edfb_clu.f @BUILDFORTRAN_TRUE@am__objects_2 = ../src/cluster/edfb_clu.$(OBJEXT) @BUILDFORTRAN_TRUE@am_cluster_edfb_clu_OBJECTS = $(am__objects_2) @@ -187,7 +189,7 @@ cluster_edfb_clu_LDADD = $(LDADD) cluster_edfb_clu_DEPENDENCIES = libnptm/libnptm.la \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_1) + $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) am__cluster_np_cluster_SOURCES_DIST = ../src/cluster/np_cluster.cpp \ ../src/cluster/cluster.cpp @BUILDFORTRAN_FALSE@am_cluster_np_cluster_OBJECTS = \ @@ -201,7 +203,7 @@ cluster_np_cluster_LDADD = $(LDADD) cluster_np_cluster_DEPENDENCIES = libnptm/libnptm.la \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_1) + $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) am__inclusion_edfb_inclu_SOURCES_DIST = ../src/inclusion/edfb_inclu.f @BUILDFORTRAN_TRUE@am__objects_3 = \ @BUILDFORTRAN_TRUE@ ../src/inclusion/edfb_inclu.$(OBJEXT) @@ -211,7 +213,7 @@ inclusion_edfb_inclu_LDADD = $(LDADD) inclusion_edfb_inclu_DEPENDENCIES = 
libnptm/libnptm.la \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_1) + $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) am__inclusion_inclu_SOURCES_DIST = ../src/inclusion/inclu.f @BUILDFORTRAN_TRUE@am__objects_4 = ../src/inclusion/inclu.$(OBJEXT) @BUILDFORTRAN_TRUE@am_inclusion_inclu_OBJECTS = $(am__objects_4) @@ -220,7 +222,7 @@ inclusion_inclu_LDADD = $(LDADD) inclusion_inclu_DEPENDENCIES = libnptm/libnptm.la \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_1) + $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) am__inclusion_np_inclusion_SOURCES_DIST = \ ../src/inclusion/np_inclusion.cpp \ ../src/inclusion/inclusion.cpp @@ -235,7 +237,7 @@ inclusion_np_inclusion_LDADD = $(LDADD) inclusion_np_inclusion_DEPENDENCIES = libnptm/libnptm.la \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_1) + $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) am__sphere_edfb_sph_SOURCES_DIST = ../src/sphere/edfb_sph.f @BUILDFORTRAN_TRUE@am__objects_5 = ../src/sphere/edfb_sph.$(OBJEXT) @BUILDFORTRAN_TRUE@am_sphere_edfb_sph_OBJECTS = $(am__objects_5) @@ -244,7 +246,7 @@ sphere_edfb_sph_LDADD = $(LDADD) sphere_edfb_sph_DEPENDENCIES = libnptm/libnptm.la \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_1) + $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) am__sphere_np_sphere_SOURCES_DIST = ../src/sphere/np_sphere.cpp \ ../src/sphere/sphere.cpp @BUILDFORTRAN_FALSE@am_sphere_np_sphere_OBJECTS = \ @@ -258,7 +260,7 @@ sphere_np_sphere_LDADD = $(LDADD) sphere_np_sphere_DEPENDENCIES = libnptm/libnptm.la \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_1) + $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) am__sphere_sph_SOURCES_DIST = ../src/sphere/sph.f @BUILDFORTRAN_TRUE@am__objects_6 = 
../src/sphere/sph.$(OBJEXT) @BUILDFORTRAN_TRUE@am_sphere_sph_OBJECTS = $(am__objects_6) @@ -266,7 +268,8 @@ sphere_sph_OBJECTS = $(am_sphere_sph_OBJECTS) sphere_sph_LDADD = $(LDADD) sphere_sph_DEPENDENCIES = libnptm/libnptm.la $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) + $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ + $(am__DEPENDENCIES_1) am__testing_test_ParticleDescriptor_SOURCES_DIST = \ ../src/testing/test_ParticleDescriptor.cpp @BUILDFORTRAN_FALSE@am_testing_test_ParticleDescriptor_OBJECTS = ../src/testing/test_ParticleDescriptor.$(OBJEXT) @@ -277,7 +280,7 @@ testing_test_ParticleDescriptor_LDADD = $(LDADD) testing_test_ParticleDescriptor_DEPENDENCIES = libnptm/libnptm.la \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_1) + $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) am__testing_test_TEDF_SOURCES_DIST = ../src/testing/test_TEDF.cpp @BUILDFORTRAN_FALSE@am_testing_test_TEDF_OBJECTS = \ @BUILDFORTRAN_FALSE@ ../src/testing/test_TEDF.$(OBJEXT) @@ -288,7 +291,7 @@ testing_test_TEDF_LDADD = $(LDADD) testing_test_TEDF_DEPENDENCIES = libnptm/libnptm.la \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_1) + $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) am__testing_test_TTMS_SOURCES_DIST = ../src/testing/test_TTMS.cpp @BUILDFORTRAN_FALSE@am_testing_test_TTMS_OBJECTS = \ @BUILDFORTRAN_FALSE@ ../src/testing/test_TTMS.$(OBJEXT) @@ -299,7 +302,7 @@ testing_test_TTMS_LDADD = $(LDADD) testing_test_TTMS_DEPENDENCIES = libnptm/libnptm.la \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_1) + $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) am__trapping_frfme_SOURCES_DIST = ../src/trapping/frfme.f @BUILDFORTRAN_TRUE@am__objects_7 = ../src/trapping/frfme.$(OBJEXT) @BUILDFORTRAN_TRUE@am_trapping_frfme_OBJECTS = 
$(am__objects_7) @@ -307,7 +310,8 @@ trapping_frfme_OBJECTS = $(am_trapping_frfme_OBJECTS) trapping_frfme_LDADD = $(LDADD) trapping_frfme_DEPENDENCIES = libnptm/libnptm.la $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) + $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ + $(am__DEPENDENCIES_1) am__trapping_lffft_SOURCES_DIST = ../src/trapping/lffft.f @BUILDFORTRAN_TRUE@am__objects_8 = ../src/trapping/lffft.$(OBJEXT) @BUILDFORTRAN_TRUE@am_trapping_lffft_OBJECTS = $(am__objects_8) @@ -315,7 +319,8 @@ trapping_lffft_OBJECTS = $(am_trapping_lffft_OBJECTS) trapping_lffft_LDADD = $(LDADD) trapping_lffft_DEPENDENCIES = libnptm/libnptm.la $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) + $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ + $(am__DEPENDENCIES_1) am__trapping_np_trapping_SOURCES_DIST = \ ../src/trapping/np_trapping.cpp ../src/trapping/cfrfme.cpp \ ../src/trapping/clffft.cpp @@ -332,7 +337,7 @@ trapping_np_trapping_LDADD = $(LDADD) trapping_np_trapping_DEPENDENCIES = libnptm/libnptm.la \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_1) + $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) AM_V_P = $(am__v_P_@AM_V@) am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) am__v_P_0 = false @@ -358,6 +363,7 @@ am__depfiles_remade = ../src/cluster/$(DEPDIR)/cluster.Po \ ../src/libnptm/$(DEPDIR)/TransitionMatrix.Plo \ ../src/libnptm/$(DEPDIR)/algebraic.Plo \ ../src/libnptm/$(DEPDIR)/clu_subs.Plo \ + ../src/libnptm/$(DEPDIR)/cublas_calls.Plo \ ../src/libnptm/$(DEPDIR)/file_io.Plo \ ../src/libnptm/$(DEPDIR)/inclu_subs.Plo \ ../src/libnptm/$(DEPDIR)/lapack_calls.Plo \ @@ -522,6 +528,8 @@ CLANGFLAGS = @CLANGFLAGS@ CPPFLAGS = @CPPFLAGS@ CSCOPE = @CSCOPE@ CTAGS = @CTAGS@ +CUBLASFLAGS = @CUBLASFLAGS@ +CUBLASLDFLAGS = @CUBLASLDFLAGS@ CXX = @CXX@ CXXCPP = @CXXCPP@ CXXDEPMODE = @CXXDEPMODE@ @@ -650,9 +658,9 
@@ target_alias = @target_alias@ top_build_prefix = @top_build_prefix@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ -LDADD = libnptm/libnptm.la -L/usr/lib64 ${USER_LDFLAGS} ${HDF5_LDFLAGS} ${LAPACKLDFLAGS} ${BLASLDFLAGS} ${MAGMALDFLAGS} +LDADD = libnptm/libnptm.la -L/usr/lib64 ${USER_LDFLAGS} ${HDF5_LDFLAGS} ${LAPACKLDFLAGS} ${BLASLDFLAGS} ${CUBLASLDFLAGS} ${MAGMALDFLAGS} lib_LTLIBRARIES = libnptm/libnptm.la -libnptm_libnptm_la_SOURCES = ../src/libnptm/algebraic.cpp ../src/libnptm/clu_subs.cpp ../src/libnptm/Commons.cpp ../src/libnptm/Configuration.cpp ../src/libnptm/file_io.cpp ../src/libnptm/inclu_subs.cpp ../src/libnptm/lapack_calls.cpp ../src/libnptm/logging.cpp ../src/libnptm/magma_calls.cpp ../src/libnptm/Parsers.cpp ../src/libnptm/sph_subs.cpp ../src/libnptm/utils.cpp ../src/libnptm/tfrfme.cpp ../src/libnptm/TransitionMatrix.cpp ../src/libnptm/tra_subs.cpp +libnptm_libnptm_la_SOURCES = ../src/libnptm/algebraic.cpp ../src/libnptm/clu_subs.cpp ../src/libnptm/Commons.cpp ../src/libnptm/Configuration.cpp ../src/libnptm/file_io.cpp ../src/libnptm/inclu_subs.cpp ../src/libnptm/lapack_calls.cpp ../src/libnptm/logging.cpp ../src/libnptm/magma_calls.cpp ../src/libnptm/cublas_calls.cpp ../src/libnptm/Parsers.cpp ../src/libnptm/sph_subs.cpp ../src/libnptm/utils.cpp ../src/libnptm/tfrfme.cpp ../src/libnptm/TransitionMatrix.cpp ../src/libnptm/tra_subs.cpp @BUILDFORTRAN_FALSE@PROGS = cluster/np_cluster inclusion/np_inclusion sphere/np_sphere trapping/np_trapping testing/test_ParticleDescriptor testing/test_TEDF testing/test_TTMS @BUILDFORTRAN_TRUE@PROGS = cluster/edfb_clu cluster/clu cluster/np_cluster inclusion/edfb_inclu inclusion/inclu inclusion/np_inclusion sphere/edfb_sph sphere/sph sphere/np_sphere trapping/frfme trapping/lffft trapping/np_trapping testing/test_ParticleDescriptor testing/test_TEDF testing/test_TTMS @BUILDFORTRAN_TRUE@EDFBCLUSOURCES = ../src/cluster/edfb_clu.f @@ -830,6 +838,8 @@ clean-libLTLIBRARIES: 
../src/libnptm/$(DEPDIR)/$(am__dirstamp) ../src/libnptm/magma_calls.lo: ../src/libnptm/$(am__dirstamp) \ ../src/libnptm/$(DEPDIR)/$(am__dirstamp) +../src/libnptm/cublas_calls.lo: ../src/libnptm/$(am__dirstamp) \ + ../src/libnptm/$(DEPDIR)/$(am__dirstamp) ../src/libnptm/Parsers.lo: ../src/libnptm/$(am__dirstamp) \ ../src/libnptm/$(DEPDIR)/$(am__dirstamp) ../src/libnptm/sph_subs.lo: ../src/libnptm/$(am__dirstamp) \ @@ -1022,6 +1032,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@../src/libnptm/$(DEPDIR)/TransitionMatrix.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@../src/libnptm/$(DEPDIR)/algebraic.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@../src/libnptm/$(DEPDIR)/clu_subs.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@../src/libnptm/$(DEPDIR)/cublas_calls.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@../src/libnptm/$(DEPDIR)/file_io.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@../src/libnptm/$(DEPDIR)/inclu_subs.Plo@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@../src/libnptm/$(DEPDIR)/lapack_calls.Plo@am__quote@ # am--include-marker @@ -1399,6 +1410,7 @@ distclean: distclean-am -rm -f ../src/libnptm/$(DEPDIR)/TransitionMatrix.Plo -rm -f ../src/libnptm/$(DEPDIR)/algebraic.Plo -rm -f ../src/libnptm/$(DEPDIR)/clu_subs.Plo + -rm -f ../src/libnptm/$(DEPDIR)/cublas_calls.Plo -rm -f ../src/libnptm/$(DEPDIR)/file_io.Plo -rm -f ../src/libnptm/$(DEPDIR)/inclu_subs.Plo -rm -f ../src/libnptm/$(DEPDIR)/lapack_calls.Plo @@ -1473,6 +1485,7 @@ maintainer-clean: maintainer-clean-am -rm -f ../src/libnptm/$(DEPDIR)/TransitionMatrix.Plo -rm -f ../src/libnptm/$(DEPDIR)/algebraic.Plo -rm -f ../src/libnptm/$(DEPDIR)/clu_subs.Plo + -rm -f ../src/libnptm/$(DEPDIR)/cublas_calls.Plo -rm -f ../src/libnptm/$(DEPDIR)/file_io.Plo -rm -f ../src/libnptm/$(DEPDIR)/inclu_subs.Plo -rm -f 
../src/libnptm/$(DEPDIR)/lapack_calls.Plo diff --git a/build/build_aux/config.guess b/build/build_aux/config.guess index cdfc4392047ce3843a7a98f5451bbe97cb8200ea..7f76b6228f73d674f58cfcc3523f99e253ee5515 100755 --- a/build/build_aux/config.guess +++ b/build/build_aux/config.guess @@ -1,10 +1,10 @@ #! /bin/sh # Attempt to guess a canonical system name. -# Copyright 1992-2023 Free Software Foundation, Inc. +# Copyright 1992-2022 Free Software Foundation, Inc. # shellcheck disable=SC2006,SC2268 # see below for rationale -timestamp='2023-08-22' +timestamp='2022-01-09' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by @@ -47,7 +47,7 @@ me=`echo "$0" | sed -e 's,.*/,,'` usage="\ Usage: $0 [OPTION] -Output the configuration name of the system '$me' is run on. +Output the configuration name of the system \`$me' is run on. Options: -h, --help print this help, then exit @@ -60,13 +60,13 @@ version="\ GNU config.guess ($timestamp) Originally written by Per Bothner. -Copyright 1992-2023 Free Software Foundation, Inc. +Copyright 1992-2022 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." help=" -Try '$me --help' for more information." +Try \`$me --help' for more information." # Parse command line while test $# -gt 0 ; do @@ -102,8 +102,8 @@ GUESS= # temporary files to be created and, as you can see below, it is a # headache to deal with in a portable fashion. -# Historically, 'CC_FOR_BUILD' used to be named 'HOST_CC'. We still -# use 'HOST_CC' if defined, but it is deprecated. +# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still +# use `HOST_CC' if defined, but it is deprecated. # Portable tmp directory creation inspired by the Autoconf team. 
@@ -155,9 +155,6 @@ Linux|GNU|GNU/*) set_cc_for_build cat <<-EOF > "$dummy.c" - #if defined(__ANDROID__) - LIBC=android - #else #include #if defined(__UCLIBC__) LIBC=uclibc @@ -172,7 +169,6 @@ Linux|GNU|GNU/*) LIBC=musl #endif #endif - #endif EOF cc_set_libc=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^LIBC' | sed 's, ,,g'` eval "$cc_set_libc" @@ -463,7 +459,7 @@ case $UNAME_MACHINE:$UNAME_SYSTEM:$UNAME_RELEASE:$UNAME_VERSION in UNAME_RELEASE=`uname -v` ;; esac - # Japanese Language versions have a version number like '4.1.3-JL'. + # Japanese Language versions have a version number like `4.1.3-JL'. SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/-/_/'` GUESS=sparc-sun-sunos$SUN_REL ;; @@ -908,7 +904,7 @@ EOF fi ;; *:FreeBSD:*:*) - UNAME_PROCESSOR=`uname -p` + UNAME_PROCESSOR=`/usr/bin/uname -p` case $UNAME_PROCESSOR in amd64) UNAME_PROCESSOR=x86_64 ;; @@ -970,37 +966,11 @@ EOF GNU_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'` GUESS=$UNAME_MACHINE-unknown-$GNU_SYS$GNU_REL-$LIBC ;; - x86_64:[Mm]anagarm:*:*|i?86:[Mm]anagarm:*:*) - GUESS="$UNAME_MACHINE-pc-managarm-mlibc" - ;; - *:[Mm]anagarm:*:*) - GUESS="$UNAME_MACHINE-unknown-managarm-mlibc" - ;; *:Minix:*:*) GUESS=$UNAME_MACHINE-unknown-minix ;; aarch64:Linux:*:*) - set_cc_for_build - CPU=$UNAME_MACHINE - LIBCABI=$LIBC - if test "$CC_FOR_BUILD" != no_compiler_found; then - ABI=64 - sed 's/^ //' << EOF > "$dummy.c" - #ifdef __ARM_EABI__ - #ifdef __ARM_PCS_VFP - ABI=eabihf - #else - ABI=eabi - #endif - #endif -EOF - cc_set_abi=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^ABI' | sed 's, ,,g'` - eval "$cc_set_abi" - case $ABI in - eabi | eabihf) CPU=armv8l; LIBCABI=$LIBC$ABI ;; - esac - fi - GUESS=$CPU-unknown-linux-$LIBCABI + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC ;; aarch64_be:Linux:*:*) UNAME_MACHINE=aarch64_be @@ -1066,16 +1036,7 @@ EOF k1om:Linux:*:*) GUESS=$UNAME_MACHINE-unknown-linux-$LIBC ;; - kvx:Linux:*:*) - GUESS=$UNAME_MACHINE-unknown-linux-$LIBC - ;; - kvx:cos:*:*) - 
GUESS=$UNAME_MACHINE-unknown-cos - ;; - kvx:mbr:*:*) - GUESS=$UNAME_MACHINE-unknown-mbr - ;; - loongarch32:Linux:*:* | loongarch64:Linux:*:*) + loongarch32:Linux:*:* | loongarch64:Linux:*:* | loongarchx32:Linux:*:*) GUESS=$UNAME_MACHINE-unknown-linux-$LIBC ;; m32r*:Linux:*:*) @@ -1190,27 +1151,16 @@ EOF ;; x86_64:Linux:*:*) set_cc_for_build - CPU=$UNAME_MACHINE LIBCABI=$LIBC if test "$CC_FOR_BUILD" != no_compiler_found; then - ABI=64 - sed 's/^ //' << EOF > "$dummy.c" - #ifdef __i386__ - ABI=x86 - #else - #ifdef __ILP32__ - ABI=x32 - #endif - #endif -EOF - cc_set_abi=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^ABI' | sed 's, ,,g'` - eval "$cc_set_abi" - case $ABI in - x86) CPU=i686 ;; - x32) LIBCABI=${LIBC}x32 ;; - esac + if (echo '#ifdef __ILP32__'; echo IS_X32; echo '#endif') | \ + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_X32 >/dev/null + then + LIBCABI=${LIBC}x32 + fi fi - GUESS=$CPU-pc-linux-$LIBCABI + GUESS=$UNAME_MACHINE-pc-linux-$LIBCABI ;; xtensa*:Linux:*:*) GUESS=$UNAME_MACHINE-unknown-linux-$LIBC @@ -1230,7 +1180,7 @@ EOF GUESS=$UNAME_MACHINE-pc-sysv4.2uw$UNAME_VERSION ;; i*86:OS/2:*:*) - # If we were able to find 'uname', then EMX Unix compatibility + # If we were able to find `uname', then EMX Unix compatibility # is probably installed. GUESS=$UNAME_MACHINE-pc-os2-emx ;; @@ -1371,7 +1321,7 @@ EOF GUESS=ns32k-sni-sysv fi ;; - PENTIUM:*:4.0*:*) # Unisys 'ClearPath HMP IX 4000' SVR4/MP effort + PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort # says GUESS=i586-unisys-sysv4 ;; @@ -1417,11 +1367,8 @@ EOF BePC:Haiku:*:*) # Haiku running on Intel PC compatible. 
GUESS=i586-pc-haiku ;; - ppc:Haiku:*:*) # Haiku running on Apple PowerPC - GUESS=powerpc-apple-haiku - ;; - *:Haiku:*:*) # Haiku modern gcc (not bound by BeOS compat) - GUESS=$UNAME_MACHINE-unknown-haiku + x86_64:Haiku:*:*) + GUESS=x86_64-unknown-haiku ;; SX-4:SUPER-UX:*:*) GUESS=sx4-nec-superux$UNAME_RELEASE diff --git a/build/build_aux/config.sub b/build/build_aux/config.sub index defe52c0c874baa521e591c2b520f15de8a5f024..dba16e84c77c7d25871d80c24deff717faf4c094 100755 --- a/build/build_aux/config.sub +++ b/build/build_aux/config.sub @@ -1,10 +1,10 @@ #! /bin/sh # Configuration validation subroutine script. -# Copyright 1992-2023 Free Software Foundation, Inc. +# Copyright 1992-2022 Free Software Foundation, Inc. # shellcheck disable=SC2006,SC2268 # see below for rationale -timestamp='2023-09-19' +timestamp='2022-01-03' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by @@ -76,13 +76,13 @@ Report bugs and patches to ." version="\ GNU config.sub ($timestamp) -Copyright 1992-2023 Free Software Foundation, Inc. +Copyright 1992-2022 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." help=" -Try '$me --help' for more information." +Try \`$me --help' for more information." 
# Parse command line while test $# -gt 0 ; do @@ -130,7 +130,7 @@ IFS=$saved_IFS # Separate into logical components for further validation case $1 in *-*-*-*-*) - echo "Invalid configuration '$1': more than four components" >&2 + echo Invalid configuration \`"$1"\': more than four components >&2 exit 1 ;; *-*-*-*) @@ -145,8 +145,7 @@ case $1 in nto-qnx* | linux-* | uclinux-uclibc* \ | uclinux-gnu* | kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* \ | netbsd*-eabi* | kopensolaris*-gnu* | cloudabi*-eabi* \ - | storm-chaos* | os2-emx* | rtmk-nova* | managarm-* \ - | windows-* ) + | storm-chaos* | os2-emx* | rtmk-nova*) basic_machine=$field1 basic_os=$maybe_os ;; @@ -944,7 +943,7 @@ $basic_machine EOF IFS=$saved_IFS ;; - # We use 'pc' rather than 'unknown' + # We use `pc' rather than `unknown' # because (1) that's what they normally are, and # (2) the word "unknown" tends to confuse beginning users. i*86 | x86_64) @@ -1076,7 +1075,7 @@ case $cpu-$vendor in pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) cpu=i586 ;; - pentiumpro-* | p6-* | 6x86-* | athlon-* | athlon_*-*) + pentiumpro-* | p6-* | 6x86-* | athlon-* | athalon_*-*) cpu=i686 ;; pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) @@ -1181,7 +1180,7 @@ case $cpu-$vendor in case $cpu in 1750a | 580 \ | a29k \ - | aarch64 | aarch64_be | aarch64c | arm64ec \ + | aarch64 | aarch64_be \ | abacus \ | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] \ | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] \ @@ -1200,23 +1199,45 @@ case $cpu-$vendor in | d10v | d30v | dlx | dsp16xx \ | e2k | elxsi | epiphany \ | f30[01] | f700 | fido | fr30 | frv | ft32 | fx80 \ - | javascript \ | h8300 | h8500 \ | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ | hexagon \ | i370 | i*86 | i860 | i960 | ia16 | ia64 \ | ip2k | iq2000 \ | k1om \ - | kvx \ | le32 | le64 \ | lm32 \ - | loongarch32 | loongarch64 \ + | loongarch32 | loongarch64 | loongarchx32 \ | m32c | m32r | m32rle \ | m5200 | m68000 | m680[012346]0 | m68360 | 
m683?2 | m68k \ | m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x \ | m88110 | m88k | maxq | mb | mcore | mep | metag \ | microblaze | microblazeel \ - | mips* \ + | mips | mipsbe | mipseb | mipsel | mipsle \ + | mips16 \ + | mips64 | mips64eb | mips64el \ + | mips64octeon | mips64octeonel \ + | mips64orion | mips64orionel \ + | mips64r5900 | mips64r5900el \ + | mips64vr | mips64vrel \ + | mips64vr4100 | mips64vr4100el \ + | mips64vr4300 | mips64vr4300el \ + | mips64vr5000 | mips64vr5000el \ + | mips64vr5900 | mips64vr5900el \ + | mipsisa32 | mipsisa32el \ + | mipsisa32r2 | mipsisa32r2el \ + | mipsisa32r3 | mipsisa32r3el \ + | mipsisa32r5 | mipsisa32r5el \ + | mipsisa32r6 | mipsisa32r6el \ + | mipsisa64 | mipsisa64el \ + | mipsisa64r2 | mipsisa64r2el \ + | mipsisa64r3 | mipsisa64r3el \ + | mipsisa64r5 | mipsisa64r5el \ + | mipsisa64r6 | mipsisa64r6el \ + | mipsisa64sb1 | mipsisa64sb1el \ + | mipsisa64sr71k | mipsisa64sr71kel \ + | mipsr5900 | mipsr5900el \ + | mipstx39 | mipstx39el \ | mmix \ | mn10200 | mn10300 \ | moxie \ @@ -1264,7 +1285,7 @@ case $cpu-$vendor in ;; *) - echo "Invalid configuration '$1': machine '$cpu-$vendor' not recognized" 1>&2 + echo Invalid configuration \`"$1"\': machine \`"$cpu-$vendor"\' not recognized 1>&2 exit 1 ;; esac @@ -1285,12 +1306,11 @@ esac # Decode manufacturer-specific aliases for certain operating systems. -if test x"$basic_os" != x +if test x$basic_os != x then # First recognize some ad-hoc cases, or perhaps split kernel-os, or else just # set os. 
-obj= case $basic_os in gnu/linux*) kernel=linux @@ -1321,10 +1341,6 @@ EOF kernel=linux os=`echo "$basic_os" | sed -e 's|linux|gnu|'` ;; - managarm*) - kernel=managarm - os=`echo "$basic_os" | sed -e 's|managarm|mlibc|'` - ;; *) kernel= os=$basic_os @@ -1490,16 +1506,10 @@ case $os in os=eabi ;; *) - os= - obj=elf + os=elf ;; esac ;; - aout* | coff* | elf* | pe*) - # These are machine code file formats, not OSes - obj=$os - os= - ;; *) # No normalization, but not necessarily accepted, that comes below. ;; @@ -1518,15 +1528,12 @@ else # system, and we'll never get to this point. kernel= -obj= case $cpu-$vendor in score-*) - os= - obj=elf + os=elf ;; spu-*) - os= - obj=elf + os=elf ;; *-acorn) os=riscix1.2 @@ -1536,35 +1543,28 @@ case $cpu-$vendor in os=gnu ;; arm*-semi) - os= - obj=aout + os=aout ;; c4x-* | tic4x-*) - os= - obj=coff + os=coff ;; c8051-*) - os= - obj=elf + os=elf ;; clipper-intergraph) os=clix ;; hexagon-*) - os= - obj=elf + os=elf ;; tic54x-*) - os= - obj=coff + os=coff ;; tic55x-*) - os= - obj=coff + os=coff ;; tic6x-*) - os= - obj=coff + os=coff ;; # This must come before the *-dec entry. pdp10-*) @@ -1586,24 +1586,19 @@ case $cpu-$vendor in os=sunos3 ;; m68*-cisco) - os= - obj=aout + os=aout ;; mep-*) - os= - obj=elf + os=elf ;; mips*-cisco) - os= - obj=elf + os=elf ;; mips*-*) - os= - obj=elf + os=elf ;; or32-*) - os= - obj=coff + os=coff ;; *-tti) # must be before sparc entry or we get the wrong os. os=sysv3 @@ -1612,8 +1607,7 @@ case $cpu-$vendor in os=sunos4.1.1 ;; pru-*) - os= - obj=elf + os=elf ;; *-be) os=beos @@ -1694,12 +1688,10 @@ case $cpu-$vendor in os=uxpv ;; *-rom68k) - os= - obj=coff + os=coff ;; *-*bug) - os= - obj=coff + os=coff ;; *-apple) os=macos @@ -1717,8 +1709,7 @@ esac fi -# Now, validate our (potentially fixed-up) individual pieces (OS, OBJ). - +# Now, validate our (potentially fixed-up) OS. case $os in # Sometimes we do "kernel-libc", so those need to count as OSes. 
musl* | newlib* | relibc* | uclibc*) @@ -1729,9 +1720,6 @@ case $os in # VxWorks passes extra cpu info in the 4th filed. simlinux | simwindows | spe) ;; - # See `case $cpu-$os` validation below - ghcjs) - ;; # Now accept the basic system types. # The portable systems comes first. # Each alternative MUST end in a * to match a version number. @@ -1740,7 +1728,7 @@ case $os in | hpux* | unos* | osf* | luna* | dgux* | auroraux* | solaris* \ | sym* | plan9* | psp* | sim* | xray* | os68k* | v88r* \ | hiux* | abug | nacl* | netware* | windows* \ - | os9* | macos* | osx* | ios* | tvos* | watchos* \ + | os9* | macos* | osx* | ios* \ | mpw* | magic* | mmixware* | mon960* | lnews* \ | amigaos* | amigados* | msdos* | newsos* | unicos* | aof* \ | aos* | aros* | cloudabi* | sortix* | twizzler* \ @@ -1749,11 +1737,11 @@ case $os in | mirbsd* | netbsd* | dicos* | openedition* | ose* \ | bitrig* | openbsd* | secbsd* | solidbsd* | libertybsd* | os108* \ | ekkobsd* | freebsd* | riscix* | lynxos* | os400* \ - | bosx* | nextstep* | cxux* | oabi* \ - | ptx* | ecoff* | winnt* | domain* | vsta* \ + | bosx* | nextstep* | cxux* | aout* | elf* | oabi* \ + | ptx* | coff* | ecoff* | winnt* | domain* | vsta* \ | udi* | lites* | ieee* | go32* | aux* | hcos* \ | chorusrdb* | cegcc* | glidix* | serenity* \ - | cygwin* | msys* | moss* | proelf* | rtems* \ + | cygwin* | msys* | pe* | moss* | proelf* | rtems* \ | midipix* | mingw32* | mingw64* | mint* \ | uxpv* | beos* | mpeix* | udk* | moxiebox* \ | interix* | uwin* | mks* | rhapsody* | darwin* \ @@ -1766,7 +1754,7 @@ case $os in | onefs* | tirtos* | phoenix* | fuchsia* | redox* | bme* \ | midnightbsd* | amdhsa* | unleashed* | emscripten* | wasi* \ | nsk* | powerunix* | genode* | zvmoe* | qnx* | emx* | zephyr* \ - | fiwix* | mlibc* | cos* | mbr* ) + | fiwix* ) ;; # This one is extra strict with allowed versions sco3.2v2 | sco3.2v[4-9]* | sco5v6*) @@ -1774,99 +1762,41 @@ case $os in ;; none) ;; - kernel* | msvc* ) - # Restricted further below - ;; - 
'') - if test x"$obj" = x - then - echo "Invalid configuration '$1': Blank OS only allowed with explicit machine code file format" 1>&2 - fi - ;; - *) - echo "Invalid configuration '$1': OS '$os' not recognized" 1>&2 - exit 1 - ;; -esac - -case $obj in - aout* | coff* | elf* | pe*) - ;; - '') - # empty is fine - ;; *) - echo "Invalid configuration '$1': Machine code format '$obj' not recognized" 1>&2 - exit 1 - ;; -esac - -# Here we handle the constraint that a (synthetic) cpu and os are -# valid only in combination with each other and nowhere else. -case $cpu-$os in - # The "javascript-unknown-ghcjs" triple is used by GHC; we - # accept it here in order to tolerate that, but reject any - # variations. - javascript-ghcjs) - ;; - javascript-* | *-ghcjs) - echo "Invalid configuration '$1': cpu '$cpu' is not valid with os '$os$obj'" 1>&2 + echo Invalid configuration \`"$1"\': OS \`"$os"\' not recognized 1>&2 exit 1 ;; esac # As a final step for OS-related things, validate the OS-kernel combination # (given a valid OS), if there is a kernel. -case $kernel-$os-$obj in - linux-gnu*- | linux-dietlibc*- | linux-android*- | linux-newlib*- \ - | linux-musl*- | linux-relibc*- | linux-uclibc*- | linux-mlibc*- ) - ;; - uclinux-uclibc*- ) +case $kernel-$os in + linux-gnu* | linux-dietlibc* | linux-android* | linux-newlib* \ + | linux-musl* | linux-relibc* | linux-uclibc* ) ;; - managarm-mlibc*- | managarm-kernel*- ) + uclinux-uclibc* ) ;; - windows*-msvc*-) - ;; - -dietlibc*- | -newlib*- | -musl*- | -relibc*- | -uclibc*- | -mlibc*- ) + -dietlibc* | -newlib* | -musl* | -relibc* | -uclibc* ) # These are just libc implementations, not actual OSes, and thus # require a kernel. - echo "Invalid configuration '$1': libc '$os' needs explicit kernel." 1>&2 - exit 1 - ;; - -kernel*- ) - echo "Invalid configuration '$1': '$os' needs explicit kernel." 1>&2 - exit 1 - ;; - *-kernel*- ) - echo "Invalid configuration '$1': '$kernel' does not support '$os'." 
1>&2 + echo "Invalid configuration \`$1': libc \`$os' needs explicit kernel." 1>&2 exit 1 ;; - *-msvc*- ) - echo "Invalid configuration '$1': '$os' needs 'windows'." 1>&2 - exit 1 + kfreebsd*-gnu* | kopensolaris*-gnu*) ;; - kfreebsd*-gnu*- | kopensolaris*-gnu*-) + vxworks-simlinux | vxworks-simwindows | vxworks-spe) ;; - vxworks-simlinux- | vxworks-simwindows- | vxworks-spe-) - ;; - nto-qnx*-) - ;; - os2-emx-) + nto-qnx*) ;; - *-eabi*- | *-gnueabi*-) + os2-emx) ;; - none--*) - # None (no kernel, i.e. freestanding / bare metal), - # can be paired with an machine code file format + *-eabi* | *-gnueabi*) ;; - -*-) + -*) # Blank kernel with real OS is always fine. ;; - --*) - # Blank kernel and OS with real machine code file format is always fine. - ;; - *-*-*) - echo "Invalid configuration '$1': Kernel '$kernel' not known to work with OS '$os'." 1>&2 + *-*) + echo "Invalid configuration \`$1': Kernel \`$kernel' not known to work with OS \`$os'." 1>&2 exit 1 ;; esac @@ -1949,7 +1879,7 @@ case $vendor in ;; esac -echo "$cpu-$vendor${kernel:+-$kernel}${os:+-$os}${obj:+-$obj}" +echo "$cpu-$vendor-${kernel:+$kernel-}$os" exit # Local variables: diff --git a/build/build_aux/install-sh b/build/build_aux/install-sh index 7c56c9c015103600a06f59ab1183eb3966a513ab..ec298b53740270ce82b326c4c2deaa5dcdec4596 100755 --- a/build/build_aux/install-sh +++ b/build/build_aux/install-sh @@ -1,7 +1,7 @@ #!/bin/sh # install - install a program, script, or datafile -scriptversion=2023-11-23.18; # UTC +scriptversion=2020-11-14.01; # UTC # This originates from X11R5 (mit/util/scripts/install.sh), which was # later released in X11R6 (xc/config/util/install.sh) with the @@ -124,9 +124,9 @@ it's up to you to specify -f if you want it. If -S is not specified, no backups are attempted. -Report bugs to . -GNU Automake home page: . -General help using GNU software: ." +Email bug reports to bug-automake@gnu.org. 
+Automake home page: https://www.gnu.org/software/automake/ +" while test $# -ne 0; do case $1 in diff --git a/build/build_aux/ltmain.sh b/build/build_aux/ltmain.sh index 977e5237bb01a985aed489ff49b3d1d885cf75d5..4fdde9a05ebdb2d232252f41f233b4986659fffb 100755 --- a/build/build_aux/ltmain.sh +++ b/build/build_aux/ltmain.sh @@ -31,7 +31,7 @@ PROGRAM=libtool PACKAGE=libtool -VERSION="2.4.7 Debian-2.4.7-7build1" +VERSION="2.4.7 Debian-2.4.7-8" package_revision=2.4.7 @@ -2296,7 +2296,7 @@ include the following information: compiler: $LTCC compiler flags: $LTCFLAGS linker: $LD (gnu? $with_gnu_ld) - version: $progname $scriptversion Debian-2.4.7-7build1 + version: $progname $scriptversion Debian-2.4.7-8 automake: `($AUTOMAKE --version) 2>/dev/null |$SED 1q` autoconf: `($AUTOCONF --version) 2>/dev/null |$SED 1q` diff --git a/build/configure b/build/configure index a19d45e9152087491d48c71e630c9b9796356682..49a0060f5cdc115c1734b7cd43d37ab93b1e0649 100755 --- a/build/configure +++ b/build/configure @@ -661,6 +661,8 @@ USER_INCLUDE NVTXFLAGS MAGMALDFLAGS MAGMAFLAGS +CUBLASLDFLAGS +CUBLASFLAGS REFINEFLAGS BLASFLAGS BLASLDFLAGS @@ -825,6 +827,7 @@ enable_openmp enable_optimize with_lapack enable_refinement +with_cublas with_magma enable_nvtx with_include @@ -1508,6 +1511,7 @@ Optional Packages: --with-sysroot[=DIR] Search for dependent libraries within DIR (or the compiler's sysroot if not specified). --with-lapack use LAPACK [default=auto] + --with-cublas use CUBLAS [default=auto] --with-magma[=MAGMA_DIR] use MAGMA [default=auto] --with-include additional include folders [default=none] @@ -25233,6 +25237,14 @@ then : if test "x$result" = "x0"; then # BLAS was found export BLASLDFLAGS=$(pkg-config --libs blas${LAPACK_LDSPEC}) + else + declare -a pkg_array=$(pkg-config --list-all | grep openblas${LAPACK_LDSPEC}) + for i in "${pkg_array[@]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep openblas${LAPACK_LDSPEC} > /dev/null + result=$? 
+ if test "x$result" = "x0"; then + # OPENBLAS was found + export BLASLDFLAGS=$(pkg-config --libs openblas${LAPACK_LDSPEC}) + fi fi # end of BLAS decision tree # search for LAPACKe declare -a pkg_array=$(pkg-config --list-all | grep lapacke${LAPACK_LDSPEC}) @@ -25335,6 +25347,14 @@ else case e in #( if test "x$result" = "x0"; then # BLAS was found export BLASLDFLAGS=$(pkg-config --libs blas${LAPACK_LDSPEC}) + else + declare -a pkg_array=$(pkg-config --list-all | grep openblas${LAPACK_LDSPEC}) + for i in "${pkg_array[@]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep openblas${LAPACK_LDSPEC} > /dev/null + result=$? + if test "x$result" = "x0"; then + # OPENBLAS was found + export BLASLDFLAGS=$(pkg-config --libs openblas${LAPACK_LDSPEC}) + fi fi # end of BLAS decision tree # search for LAPACKe declare -a pkg_array=$(pkg-config --list-all | grep lapacke${LAPACK_LDSPEC}) @@ -25427,6 +25447,169 @@ fi +# Check whether --with-cublas was given. +if test ${with_cublas+y} +then : + withval=$with_cublas; + if test "x$withval" = "xno"; then + CUBLASFLAGS="" + + CUBLASLDFLAGS="" + + else + + pkg-config --version > /dev/null + use_pkg_config=$? + if test "x${CUDAFLAGS}${CUDALDFLAGS}" = "x"; then + if test "x$use_pkg_config" = "x0"; then + # pkg-config is available + declare -a pkg_array=$(pkg-config --list-all | grep cublas) + for i in "${pkg_array[@]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep cublas > /dev/null + result=$? + if test "x$result" = "x0"; then + # CUBLAS detected + cuda_pkg=$(for i in "${pkg_array[@]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep cublas) + CUDAFLAGS=$(pkg-config --cflags ${cuda_pkg}) + CUDALDFLAGS=$(pkg-config --libs ${cuda_pkg}) + fi # end of CUBLAS runtime decision tree + declare -a pkg_array=$(pkg-config --list-all | grep cudart) + for i in "${pkg_array[@]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep cudart > /dev/null + result=$? 
+ if test "x$result" = "x0"; then + # CUDA runtime detected + cuda_pkg=$(for i in "${pkg_array[@]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep cudart) + CUDAFLAGS=$(pkg-config --cflags ${cuda_pkg}) + CUDALDFLAGS=$(pkg-config --libs ${cuda_pkg}) + fi # end of CUDA runtime decision tree + echo $CUDALDFLAGS | grep cublas > /dev/null + cudart_check=$? + if test "x${cudart_check}" != "x0"; then + CUDALDFLAGS="$CUDALDFLAGS -lcublas" + fi + echo $CUDALDFLAGS | grep cudart > /dev/null + cudart_check=$? + if test "x${cudart_check}" != "x0"; then + CUDALDFLAGS="$CUDALDFLAGS -lcudart" + fi + else + # pkg-config is not available + if test -f /usr/local/cuda/include/cuda.h; then + CUDAFLAGS="-I/usr/local/cuda/include" + CUDALDFLAGS="-L/usr/local/cuda/lib64 -lcublas -lcudart" + elif test -f /usr/include/cuda.h; then + CUDAFLAGS="-I/usr/include" + CUDALDFLAGS="-lcublas -lcudart" + elif test "x$CUDA_HOME" != "x"; then + CUDAFLAGS="-I${CUDA_HOME}/include" + CUDALDFLAGS="-L${CUDA_HOME}/lib64 -lcublas -lcudart" + fi + fi # end of pkg-config decision tree + fi # end of CUDAFLAGS user override protection + if test "x $CUDAFLAGS $CUDALDFLAGS" != "x"; then + # somehow CUDAFLAGS or CUDALDFLAGS was defined + export CUDAFLAGS + export CUBLASFLAGS="-DUSE_CUBLAS ${CUDAFLAGS}" + export CUDALDFLAGS + export CUBLASLDFLAGS="${CUDALDFLAGS}" + fi + + + if test "x$CUBLASLDFLAGS" != "x" +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: CUBLAS detected." >&5 +printf "%s\n" "$as_me: CUBLAS detected." >&6;} +else case e in #( + e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: CUBLAS not found." >&5 +printf "%s\n" "$as_me: CUBLAS not found." >&6;} + ;; +esac +fi + CUBLASFLAGS=${CUBLASFLAGS} + + CUBLASLDFLAGS=${CUBLASLDFLAGS} + + fi + +else case e in #( + e) + + pkg-config --version > /dev/null + use_pkg_config=$? 
+ if test "x${CUDAFLAGS}${CUDALDFLAGS}" = "x"; then + if test "x$use_pkg_config" = "x0"; then + # pkg-config is available + declare -a pkg_array=$(pkg-config --list-all | grep cublas) + for i in "${pkg_array[@]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep cublas > /dev/null + result=$? + if test "x$result" = "x0"; then + # CUBLAS detected + cuda_pkg=$(for i in "${pkg_array[@]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep cublas) + CUDAFLAGS=$(pkg-config --cflags ${cuda_pkg}) + CUDALDFLAGS=$(pkg-config --libs ${cuda_pkg}) + fi # end of CUBLAS runtime decision tree + declare -a pkg_array=$(pkg-config --list-all | grep cudart) + for i in "${pkg_array[@]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep cudart > /dev/null + result=$? + if test "x$result" = "x0"; then + # CUDA runtime detected + cuda_pkg=$(for i in "${pkg_array[@]}"; do echo "$i" | cut --delimiter=" " -f1; done | grep cudart) + CUDAFLAGS=$(pkg-config --cflags ${cuda_pkg}) + CUDALDFLAGS=$(pkg-config --libs ${cuda_pkg}) + fi # end of CUDA runtime decision tree + echo $CUDALDFLAGS | grep cublas > /dev/null + cudart_check=$? + if test "x${cudart_check}" != "x0"; then + CUDALDFLAGS="$CUDALDFLAGS -lcublas" + fi + echo $CUDALDFLAGS | grep cudart > /dev/null + cudart_check=$? 
+ if test "x${cudart_check}" != "x0"; then + CUDALDFLAGS="$CUDALDFLAGS -lcudart" + fi + else + # pkg-config is not available + if test -f /usr/local/cuda/include/cuda.h; then + CUDAFLAGS="-I/usr/local/cuda/include" + CUDALDFLAGS="-L/usr/local/cuda/lib64 -lcublas -lcudart" + elif test -f /usr/include/cuda.h; then + CUDAFLAGS="-I/usr/include" + CUDALDFLAGS="-lcublas -lcudart" + elif test "x$CUDA_HOME" != "x"; then + CUDAFLAGS="-I${CUDA_HOME}/include" + CUDALDFLAGS="-L${CUDA_HOME}/lib64 -lcublas -lcudart" + fi + fi # end of pkg-config decision tree + fi # end of CUDAFLAGS user override protection + if test "x $CUDAFLAGS $CUDALDFLAGS" != "x"; then + # somehow CUDAFLAGS or CUDALDFLAGS was defined + export CUDAFLAGS + export CUBLASFLAGS="-DUSE_CUBLAS ${CUDAFLAGS}" + export CUDALDFLAGS + export CUBLASLDFLAGS="${CUDALDFLAGS}" + fi + + + if test "x$CUBLASLDFLAGS" != "x" +then : + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: CUBLAS detected. Activating by default (use --without-cublas to disable)." >&5 +printf "%s\n" "$as_me: CUBLAS detected. Activating by default (use --without-cublas to disable)." >&6;} +else case e in #( + e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: CUBLAS not found." >&5 +printf "%s\n" "$as_me: CUBLAS not found." >&6;} + ;; +esac +fi + CUBLASFLAGS=${CUBLASFLAGS} + + CUBLASLDFLAGS=${CUBLASLDFLAGS} + + + ;; +esac +fi + + + # Check whether --with-magma was given. 
m4_define(
  [M4_DETECT_CUBLAS],
  [
    # Detect the CUDA runtime and CUBLAS.
    #
    # Inputs:
    #   CUDAFLAGS, CUDALDFLAGS  user overrides; when either one is non-empty
    #                           no detection is attempted and they are used
    #                           verbatim.
    #   CUDA_HOME               last-resort installation prefix.
    # Outputs, exported only when something was actually found or given:
    #   CUDAFLAGS, CUDALDFLAGS, CUBLASFLAGS, CUBLASLDFLAGS
    #
    # When nothing is found the CUBLAS variables are left untouched, so the
    # caller test on CUBLASLDFLAGS correctly reports that CUBLAS is missing.
    pkg-config --version > /dev/null 2>&1
    use_pkg_config=$?
    if test "x${CUDAFLAGS}${CUDALDFLAGS}" = "x"; then
      if test "x$use_pkg_config" = "x0"; then
        # pkg-config is available: pick the first module of each kind.
        # Plain pipelines are used instead of bash arrays because configure
        # scripts are run by /bin/sh.
        cublas_pkg=$(pkg-config --list-all 2> /dev/null | cut -d " " -f 1 | grep cublas | head -n 1)
        cudart_pkg=$(pkg-config --list-all 2> /dev/null | cut -d " " -f 1 | grep cudart | head -n 1)
        if test "x${cublas_pkg}" != "x"; then
          # CUBLAS module detected
          CUDAFLAGS=$(pkg-config --cflags ${cublas_pkg})
          CUDALDFLAGS=$(pkg-config --libs ${cublas_pkg})
        fi # end of CUBLAS decision tree
        if test "x${cudart_pkg}" != "x"; then
          # CUDA runtime module detected: accumulate, do not overwrite the
          # flags that were obtained for CUBLAS
          CUDAFLAGS="${CUDAFLAGS} $(pkg-config --cflags ${cudart_pkg})"
          CUDALDFLAGS="${CUDALDFLAGS} $(pkg-config --libs ${cudart_pkg})"
        fi # end of CUDA runtime decision tree
        if test "x${cublas_pkg}${cudart_pkg}" != "x"; then
          # Only complete the link line when at least one module was found;
          # otherwise a host without CUDA would be reported as CUBLAS-capable.
          echo "$CUDALDFLAGS" | grep cublas > /dev/null
          cublas_check=$?
          if test "x${cublas_check}" != "x0"; then
            CUDALDFLAGS="$CUDALDFLAGS -lcublas"
          fi
          echo "$CUDALDFLAGS" | grep cudart > /dev/null
          cudart_check=$?
          if test "x${cudart_check}" != "x0"; then
            CUDALDFLAGS="$CUDALDFLAGS -lcudart"
          fi
        fi
      fi # end of pkg-config decision tree
      if test "x${CUDAFLAGS}${CUDALDFLAGS}" = "x"; then
        # pkg-config is missing or knows nothing about CUDA: probe the
        # conventional installation locations
        if test -f /usr/local/cuda/include/cuda.h; then
          CUDAFLAGS="-I/usr/local/cuda/include"
          CUDALDFLAGS="-L/usr/local/cuda/lib64 -lcublas -lcudart"
        elif test -f /usr/include/cuda.h; then
          CUDAFLAGS="-I/usr/include"
          CUDALDFLAGS="-lcublas -lcudart"
        elif test "x$CUDA_HOME" != "x"; then
          CUDAFLAGS="-I${CUDA_HOME}/include"
          CUDALDFLAGS="-L${CUDA_HOME}/lib64 -lcublas -lcudart"
        fi
      fi # end of file system decision tree
    fi # end of CUDAFLAGS user override protection
    # No blanks inside the quoted string: with blanks the comparison against
    # "x" can never be equal and CUBLAS would always be switched on.
    if test "x${CUDAFLAGS}${CUDALDFLAGS}" != "x"; then
      # CUDAFLAGS or CUDALDFLAGS was given by the user or detected above
      export CUDAFLAGS
      export CUBLASFLAGS="-DUSE_CUBLAS ${CUDAFLAGS}"
      export CUDALDFLAGS
      export CUBLASLDFLAGS="${CUDALDFLAGS}"
    fi
  ]
)
[ + if test "x$withval" = "xno"; then + AC_SUBST([CUBLASFLAGS], [""]) + AC_SUBST([CUBLASLDFLAGS], [""]) + else + M4_DETECT_CUBLAS + AS_IF( + [test "x$CUBLASLDFLAGS" != "x"], + [AC_MSG_NOTICE([CUBLAS detected.])], + [AC_MSG_NOTICE([CUBLAS not found.])] + ) + AC_SUBST([CUBLASFLAGS], [${CUBLASFLAGS}]) + AC_SUBST([CUBLASLDFLAGS], [${CUBLASLDFLAGS}]) + fi + ], + [ + M4_DETECT_CUBLAS + AS_IF( + [test "x$CUBLASLDFLAGS" != "x"], + [AC_MSG_NOTICE([CUBLAS detected. Activating by default (use --without-cublas to disable).])], + [AC_MSG_NOTICE([CUBLAS not found.])] + ) + AC_SUBST([CUBLASFLAGS], [${CUBLASFLAGS}]) + AC_SUBST([CUBLASLDFLAGS], [${CUBLASLDFLAGS}]) + ] +) + AC_ARG_WITH( [magma], [AS_HELP_STRING([--with-magma[[=MAGMA_DIR]]], [use MAGMA @<:@default=auto@:>@])], @@ -680,8 +778,8 @@ AS_IF( [AC_SUBST([OMPFLAGS], [""])], [AC_SUBST([OMPFLAGS], [$OMPFLAGS])] ) -CXXFLAGS="$CLANGFLAGS $OPTFLAGS -ggdb $DEBUGFLAGS $OFFLOADFLAGS $USER_INCLUDE -I$HDF5_INCLUDE $OMPFLAGS $MPIFLAGS $LAPACKFLAGS $MAGMAFLAGS $NVTXFLAGS $REFINEFLAGS" -SUBDIRS="cluster libnptm sphere testing trapping" +CXXFLAGS="$CLANGFLAGS $OPTFLAGS -ggdb $DEBUGFLAGS $OFFLOADFLAGS $USER_INCLUDE -I$HDF5_INCLUDE $OMPFLAGS $MPIFLAGS $LAPACKFLAGS $CUBLASFLAGS $MAGMAFLAGS $NVTXFLAGS $REFINEFLAGS" +SUBDIRS="cluster inclusion libnptm sphere testing trapping" # Generate the output AC_OUTPUT diff --git a/src/cluster/cluster.cpp b/src/cluster/cluster.cpp index 1b320af2e5c07ccfe03b09d620c706f59062add1..8950265a73e30572c62de3830114fdcf9e4a14d0 100644 --- a/src/cluster/cluster.cpp +++ b/src/cluster/cluster.cpp @@ -24,18 +24,32 @@ #include #include #include + #ifdef _OPENMP #include #endif + #ifdef USE_MPI #ifndef MPI_VERSION #include #endif #endif + #ifdef USE_NVTX #include #endif +//#define USE_CUBLAS 1 +#ifdef USE_CUBLAS +#include +#endif + +//#ifdef USE_MAGMA +//#include +//#endif +// define by hand for a first test +//#define USE_REFINEMENT 1 + #ifndef INCLUDE_TYPES_H_ #include "../include/types.h" #endif @@ -112,10 +126,10 @@ 
void cluster(const string& config_file, const string& data_file, const string& o Logger *logger = new Logger(LOG_DEBG); int device_count = 0; +#ifdef USE_MAGMA //=========== // Initialise MAGMA //=========== -#ifdef USE_MAGMA const magma_int_t d_array_max_size = 32; // TEMPORARY: can become configurable parameter magma_device_t *device_array = new magma_device_t[d_array_max_size]; magma_int_t num_devices; @@ -138,7 +152,11 @@ void cluster(const string& config_file, const string& data_file, const string& o delete logger; return; } -#endif // end MAGMA initialisation +// end MAGMA initialisation +#elif defined USE_CUBLAS + cudaGetDeviceCount(&device_count); + logger->log("DEBUG: Proc-" + to_string(mpidata->rank) + " found " + to_string(device_count) + " CUDA devices.\n", LOG_DEBG); +#endif //=========================== // the following only happens on MPI process 0 @@ -282,6 +300,8 @@ void cluster(const string& config_file, const string& data_file, const string& o string tppoan_name = output_path + "/c_TPPOAN"; #ifdef USE_MAGMA logger->log("INFO: using MAGMA calls.\n", LOG_INFO); +#elif defined USE_CUBLAS + logger->log("INFO: using CUBLAS calls.\n", LOG_INFO); #elif defined USE_LAPACK logger->log("INFO: using LAPACK calls.\n", LOG_INFO); #else @@ -786,6 +806,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf string outam0_name = output_path + "/c_AM0_JXI" + to_string(jxi488) + ".txt"; sprintf(virtual_line, " AM matrix before CMS\n"); outam0->append_line(virtual_line); + sprintf(virtual_line, " %d\n", ndit); + outam0->append_line(virtual_line); sprintf(virtual_line, " I1+1 I2+1 Real Imag\n"); outam0->append_line(virtual_line); write_dcomplex_matrix(outam0, cid->am, ndit, ndit); @@ -798,6 +820,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf string outam1_name = output_path + "/c_AM1_JXI" + to_string(jxi488) + ".txt"; sprintf(virtual_line, " AM matrix after CMS before LUCIN\n"); 
outam1->append_line(virtual_line); + sprintf(virtual_line, " %d\n", ndit); + outam1->append_line(virtual_line); sprintf(virtual_line, " I1+1 I2+1 Real Imag\n"); outam1->append_line(virtual_line); write_dcomplex_matrix(outam1, cid->am, ndit, ndit, " %5d %5d (%17.8lE,%17.8lE)\n", 1); @@ -815,7 +839,7 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf #ifdef USE_NVTX nvtxRangePush("Invert the matrix"); #endif - // we the accuracygoal in, get the actual accuracy back out + // we put the accuracygoal in, get the actual accuracy back out double actualaccuracy = cid->accuracygoal; invert_matrix(cid->am, ndit, jer, cid->maxrefiters, actualaccuracy, cid->refinemode, mxndm, cid->proc_device); // in principle, we should check whether the returned actualaccuracy is indeed lower than the accuracygoal, and do something about it if not @@ -823,8 +847,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf if (cid->refinemode==2) { message = "INFO: calibration obtained accuracy " + to_string(actualaccuracy) + " (" + to_string(cid->accuracygoal) + " requested) in " + to_string(cid->maxrefiters) + " refinement iterations\n"; logger->log(message); - if (actualaccuracy > 1e-2) { - printf("Accuracy worse than 0.01, stopping"); + if (actualaccuracy > 1e-1) { + printf("Accuracy worse than 0.1, stopping"); exit(1); } } @@ -834,6 +858,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf string outam2_name = output_path + "/c_AM2_JXI" + to_string(jxi488) + ".txt"; sprintf(virtual_line, " AM matrix after LUCIN before ZTM\n"); outam2->append_line(virtual_line); + sprintf(virtual_line, " %d\n", ndit); + outam2->append_line(virtual_line); sprintf(virtual_line, " I1+1 I2+1 Real Imag\n"); outam2->append_line(virtual_line); write_dcomplex_matrix(outam2, cid->am, ndit, ndit); @@ -863,6 +889,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf string outam3_name = 
/* Copyright (C) 2024   INAF - Osservatorio Astronomico di Cagliari

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   A copy of the GNU General Public License is distributed along with
   this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
 */

/*! \file cublas_calls.h
 *
 * \brief C++ interface to CUBLAS calls.
 *
 * This header includes nothing itself: the `dcomplex` and `np_int` types
 * used in the prototypes must already be declared when it is included.
 */

#ifndef INCLUDE_CUBLAS_CALLS_H_
#define INCLUDE_CUBLAS_CALLS_H_

/*! \brief Invert a complex matrix with double precision elements.
 *
 * Use CUBLAS to perform an in-place matrix inversion for a complex
 * matrix with double precision elements. The inverse overwrites the
 * storage that starts at `mat[0][0]`.
 *
 * \param mat: Matrix of complex. The matrix to be inverted.
 * \param n: `np_int` The number of rows and columns of the [n x n] matrix.
 * \param device_id: `int` ID of the device for matrix inversion offloading.
 */
void cublas_zinvert(dcomplex **mat, np_int n, int device_id);

/*! \brief Invert a complex matrix with double precision elements, applying iterative refinement of the solution.
 *
 * Use CUBLAS to perform an in-place matrix inversion for a complex
 * matrix with double precision elements, followed by iterative
 * refinement of the inverse on the device.
 *
 * \param mat: Matrix of complex. The matrix to be inverted.
 * \param n: `np_int` The number of rows and columns of the [n x n] matrix.
 * \param maxrefiters: `int` Maximum number of refinement iterations to apply
 * (no refinement is attempted when it is not positive). When `refinemode`
 * is 2 it is overwritten, on return, with the number of iterations that
 * were actually performed.
 * \param accuracygoal: `double` Accuracy to achieve in iterative refinement,
 * defined as the modulus of the maximum difference between the identity
 * matrix and the matrix product of the (approximate) inverse times the
 * original matrix. On return, it contains the actually achieved accuracy.
 * \param refinemode: `int` Residual evaluation policy. Values greater than 0
 * enable the evaluation of the maximum residual; with value 2 the
 * refinement stops as soon as the residual falls below `accuracygoal` or
 * stops decreasing.
 * \param device_id: `int` ID of the device for matrix inversion offloading.
 */
void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxrefiters, double &accuracygoal, int refinemode, int device_id);

#endif
--git a/src/libnptm/cublas_calls.cpp b/src/libnptm/cublas_calls.cpp new file mode 100644 index 0000000000000000000000000000000000000000..caf483a61251d24e9991be7752f181c782c96dc0 --- /dev/null +++ b/src/libnptm/cublas_calls.cpp @@ -0,0 +1,232 @@ +/* Copyright (C) 2024 INAF - Osservatorio Astronomico di Cagliari + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + A copy of the GNU General Public License is distributed along with + this program in the COPYING file. If not, see: . + */ + +/*! \file cublas_calls.cpp + * + * \brief Implementation of the interface with CUBLAS libraries. 
+ */ +#ifndef INCLUDE_TYPES_H_ +#include "../include/types.h" +#endif + +//#define USE_CUBLAS 1 +#ifdef USE_CUBLAS + +#ifndef INCLUDE_CUBLAS_CALLS_H_ +#include "../include/cublas_calls.h" +#endif + +#include +#include +#include + +#ifdef USE_ILP64 +#define CUZGEMM cublasZgemm_64 +#define CUZAXPY cublasZaxpy_64 +#define CUIZAMAX cublasIzamax_64 +#else +#define CUZGEMM cublasZgemm +#define CUZAXPY cublasZaxpy +#define CUIZAMAX cublasIzamax +#endif + +#define cudacall(call) \ + do \ + { \ + cudaError_t err = (call); \ + if(cudaSuccess != err) \ + { \ + fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + cudaDeviceReset(); \ + exit(EXIT_FAILURE); \ + } \ + } \ + while (0) + +#define cublascall(call) \ + do \ + { \ + cublasStatus_t status = (call); \ + if(CUBLAS_STATUS_SUCCESS != status) \ + { \ + fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \ + cudaDeviceReset(); \ + exit(EXIT_FAILURE); \ + } \ + \ + } \ + while(0) + +void cublas_zinvert(dcomplex **mat, np_int n, int device_id) { + cudacall(cudaSetDevice(device_id)); + cublasHandle_t handle; + cublascall(cublasCreate_v2(&handle)); + int batchsize = 1; + int *piv, *info; // array of pivot indices + np_int m = (np_int) n; // changed rows; a - mxm matrix + np_int mm = m * m; // size of a + cudacall(cudaMalloc(&piv, m*batchsize*sizeof(int))); + cudacall(cudaMalloc(&info, batchsize*sizeof(int))); + cuDoubleComplex *a = (cuDoubleComplex *)&(mat[0][0]); // pointer to first element on host + cuDoubleComplex *d_a; // pointer to first element on device + cudacall(cudaMalloc(&d_a,m*m*sizeof(cuDoubleComplex))); + cudacall(cudaMemcpy(d_a, a, m*m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); + cuDoubleComplex **batch_d_a; + cudacall(cudaMalloc(&batch_d_a,batchsize*sizeof(cuDoubleComplex*))); + cudacall(cudaMemcpy(batch_d_a, &d_a, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice)); + 
cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize)); + cuDoubleComplex *d_c; // this will contain the inverted matrix on the device + cudacall(cudaMalloc(&d_c,m*m*sizeof(cuDoubleComplex))); + cuDoubleComplex **batch_d_c; + cudacall(cudaMalloc(&batch_d_c,batchsize*sizeof(cuDoubleComplex*))); + cudacall(cudaMemcpy(batch_d_c, &d_c, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice)); + cublascall(cublasZgetriBatched(handle,n,batch_d_a,m,piv,batch_d_c,m,info,batchsize)); + cudacall(cudaMemcpy(a,d_c,m*m*sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost)); + cudaFree(batch_d_a); + cudaFree(batch_d_c); + cudaFree(piv); + cudaFree(info); + cudaFree(d_a); + cudaFree(d_c); + cublasDestroy_v2(handle); + +} + +void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &accuracygoal, int refinemode, int device_id) { + cudacall(cudaSetDevice(device_id)); + cublasHandle_t handle; + cublascall(cublasCreate_v2(&handle)); + int batchsize = 1; + int *piv, *info; // array of pivot indices + np_int m = (np_int) n; // changed rows; a - mxm matrix + np_int mm = m * m; // size of a + cudacall(cudaMalloc(&piv, m*batchsize*sizeof(int))); + cudacall(cudaMalloc(&info, batchsize*sizeof(int))); + cuDoubleComplex *a = (cuDoubleComplex *)&(mat[0][0]); // pointer to first element on host + cuDoubleComplex *d_a; // pointer to first element on device + cudacall(cudaMalloc(&d_a,m*m*sizeof(cuDoubleComplex))); + cudacall(cudaMemcpy(d_a, a, m*m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); + cuDoubleComplex **batch_d_a; + cudacall(cudaMalloc(&batch_d_a,batchsize*sizeof(cuDoubleComplex*))); + cudacall(cudaMemcpy(batch_d_a, &d_a, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice)); + cublascall(cublasZgetrfBatched(handle, m, batch_d_a, m, piv, info, batchsize)); + cuDoubleComplex *d_c; // this will contain the inverted matrix on the device + cudacall(cudaMalloc(&d_c,m*m*sizeof(cuDoubleComplex))); + cuDoubleComplex **batch_d_c; + 
cudacall(cudaMalloc(&batch_d_c,batchsize*sizeof(cuDoubleComplex*))); + cudacall(cudaMemcpy(batch_d_c, &d_c, batchsize*sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice)); + cublascall(cublasZgetriBatched(handle,m,batch_d_a,m,piv,batch_d_c,m,info,batchsize)); + //cudacall(cudaMemcpy(a,d_c,m*m*sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost)); + cudaFree(batch_d_a); + cudaFree(batch_d_c); + cudaFree(piv); + cudaFree(info); + cuDoubleComplex *d_a_residual; + cuDoubleComplex *d_a_refine; + cuDoubleComplex *d_id; + if (maxiters>0) { + // copy the original matrix again to d_a, so I do not need to destroy the old d_a and recreate a new one + cudacall(cudaMemcpy(d_a, a, m*m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // from here on, d_a contains the original matrix, for refinement use + cudacall(cudaMalloc(&d_a_residual, m*m*sizeof(cuDoubleComplex))); + cudacall(cudaMalloc(&d_a_refine, m*m*sizeof(cuDoubleComplex))); + // allocate memory for the temporary matrix products + dcomplex *native_id = new dcomplex[1]; + native_id[0] = 1; + cuDoubleComplex *m_id = (cuDoubleComplex *) &(native_id[0]); + // fill it with 1 + cudacall(cudaMalloc(&d_id, sizeof(cuDoubleComplex))); + cudacall(cudaMemcpy(d_id, m_id, sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy identity to device vector + delete[] native_id; // free identity vector on host + } + bool iteraterefine = true; + if (maxiters>0) { + cuDoubleComplex cu_mone; + (((double *) &(cu_mone))[0]) = -1; + (((double *) &(cu_mone))[1]) = 0; + cuDoubleComplex cu_one; + (((double *) &(cu_one))[0]) = 1; + (((double *) &(cu_one))[1]) = 0; + cuDoubleComplex cu_zero; + (((double *) &(cu_zero))[0]) = 0; + (((double *) &(cu_zero))[1]) = 0; + // multiply minus the original matrix times the inverse matrix + // NOTE: factors in zgemm are swapped because zgemm is designed for column-major + // Fortran-style arrays, whereas our arrays are C-style row-major. 
+ cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m)); + // add the identity to the product + cublascall(CUZAXPY(handle, m, &cu_one, d_id, 0, d_a_residual, m+1)); + double oldmax=0; + if (refinemode >0) { + np_int maxindex; + // find the maximum absolute value of the residual + cublascall(CUIZAMAX(handle, mm, d_a_residual, 1, &maxindex)); + cuDoubleComplex cublasmax; + // transfer the maximum value to the host + cudacall(cudaMemcpy(&cublasmax, d_a_residual+maxindex, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost)); + // take the module + oldmax = cabs( (((double *) &(cublasmax))[0]) + I*(((double *) &(cublasmax))[1])); + printf("Initial max residue = %g\n", oldmax); + if (oldmax < accuracygoal) iteraterefine = false; + } + // begin correction loop (should iterate maxiters times) + int iter; + for (iter=0; (iter oldmax)||(newmax < accuracygoal))) iteraterefine = false; + oldmax = newmax; + } + } + // if we are being called with refinemode=2, then on exit we set maxiters to the actual number of iters we performed to achieve the required accuracy + if (refinemode==2) maxiters = iter; + accuracygoal = oldmax; + // end correction loop + } + // copy the final refined inverted matrix back from device to host + cudacall(cudaMemcpy(a,d_c,m*m*sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost)); + // free temporary device arrays + cudaFree(d_a); + cudaFree(d_c); + if (maxiters>0) { + cudaFree(d_id); + cudaFree(d_a_refine); + cudaFree(d_a_residual); + } + + cublasDestroy_v2(handle); + +} + + +#endif diff --git a/src/libnptm/lapack_calls.cpp b/src/libnptm/lapack_calls.cpp index 676671207bd001b680ae8ff75de818342720ca29..0b9af94ed13be9f3d9cf408f887bae334738b49e 100644 --- a/src/libnptm/lapack_calls.cpp +++ b/src/libnptm/lapack_calls.cpp @@ -92,37 +92,36 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl #endif np_int nn = n*n; np_int incx = 1; + np_int incx0 = 0; #ifdef USE_MKL 
MKL_Complex16 *arr_orig = NULL; - MKL_Complex16 *arr_residual = NULL; - MKL_Complex16 *arr_refine = NULL; - MKL_Complex16 *id = NULL; + MKL_Complex16 *arr_residual = NULL; + MKL_Complex16 *arr_refine = NULL; + MKL_Complex16 *id = NULL; #else - dcomplex *arr_orig = NULL; - dcomplex *arr_residual = NULL; - dcomplex *arr_refine = NULL; - dcomplex *id = NULL; + dcomplex *arr_orig = NULL; + dcomplex *arr_residual = NULL; + dcomplex *arr_refine = NULL; + dcomplex *id = NULL; #endif if (maxiters>0) { #ifdef USE_MKL arr_orig = new MKL_Complex16[nn]; arr_residual = new MKL_Complex16[nn]; arr_refine = new MKL_Complex16[nn]; - id = new MKL_Complex16[n]; - for (np_int i=0; i0) { - np_int maxindex = izamax_(&n, arr_residual, &incx); + np_int maxindex = izamax_(&nn, arr_residual, &incx); #ifdef USE_MKL oldmax = cabs(arr_residual[maxindex].real + I*arr_residual[maxindex].imag); #else oldmax = cabs(arr_residual[maxindex]); #endif + printf("Initial max residue = %g\n", oldmax); if (oldmax < accuracygoal) iteraterefine = false; } int iter; @@ -170,14 +170,15 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl zaxpy_(&nn, &dcone, arr_refine, &incx, arr, &incx); // zcopy_(&nn, arr_refine, &incx, arr, &incx); zgemm_(&transa, &transa, &n, &n, &n, &dcmone, arr, &n, arr_orig, &n, &dczero, arr_residual, &n); - zaxpy_(&n, &dcone, id, &incx, arr_residual, &incy); + zaxpy_(&n, &dcone, id, &incx0, arr_residual, &incy); if ((refinemode==2) || ((refinemode==1) && (iter == (maxiters-1)))) { - np_int maxindex = izamax_(&n, arr_residual, &incx); + np_int maxindex = izamax_(&nn, arr_residual, &incx); #ifdef USE_MKL double newmax = cabs(arr_residual[maxindex].real + I*arr_residual[maxindex].imag); #else double newmax = cabs(arr_residual[maxindex]); #endif + printf("Max residue after %d iterations = %g\n", iter+1, newmax); if ((refinemode==2) && ((newmax > oldmax)||(newmax < accuracygoal))) iteraterefine = false; oldmax = newmax; } diff --git 
a/src/libnptm/magma_calls.cpp b/src/libnptm/magma_calls.cpp index 74a292b1c94485db3394cb5fe50038bd6281f5d7..514af4e41153698ad6ad1210573afa3ab5e91b21 100644 --- a/src/libnptm/magma_calls.cpp +++ b/src/libnptm/magma_calls.cpp @@ -132,16 +132,16 @@ void magma_zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, } // allocate memory for the identity vector on the host { - dcomplex *native_id = new dcomplex[m]; - for (magma_int_t i=0; i0) { // find the maximum absolute value of the residual @@ -171,6 +171,7 @@ void magma_zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, magma_zgetvector(1, d_a_residual+maxindex, 1, &magmamax, 1, queue); // take the module oldmax = cabs(magmamax.x + I*magmamax.y); + printf("Initial max residue = %g\n", oldmax); if (oldmax < accuracygoal) iteraterefine = false; } // begin correction loop (should iterate maxiters times) @@ -183,7 +184,7 @@ void magma_zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, // multiply minus the original matrix times the new inverse matrix magma_zgemm(MagmaNoTrans, MagmaNoTrans, m, m, m, magma_mone, d_a, m, d_a_orig, m, magma_zero, d_a_residual, m, queue); // add the identity to the product - magma_zaxpy (m, magma_one, d_id, 1, d_a_residual, m+1, queue); + magma_zaxpy (m, magma_one, d_id, 0, d_a_residual, m+1, queue); if ((refinemode==2) || ((refinemode==1) && (iter == (maxiters-1)))) { // find the maximum absolute value of the residual magma_int_t maxindex = magma_izamax(mm, d_a_residual, 1, queue); @@ -192,6 +193,7 @@ void magma_zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, magma_zgetvector(1, d_a_residual+maxindex, 1, &magmamax, 1, queue); // take the module double newmax = cabs(magmamax.x + I*magmamax.y); + printf("Max residue after %d iterations = %g\n", iter+1, newmax); // if the maximum in the residual decreased from the previous iteration, // update oldmax and go on, otherwise no point further iterating refinements if 
((refinemode==2) && ((newmax > oldmax)||(newmax < accuracygoal))) iteraterefine = false;