Skip to content
Snippets Groups Projects
Select Git revision
  • 7e6f37d51fdcd4c5130b3261b0de513af3d1d50a
  • main default protected
  • hpc_school
  • energy
  • openacc
  • mpi_bugfix
6 results

mat_mult.c

Blame
  • mat_mult.c 5.15 KiB
    ////////////////////////////////////////////////////////////////////////////////////////////////
    // - Naive matrix multiplication algorithm
    //   for (size_t i=0 ; i<N ; i++)
    //      for (size_t j=0 ; j<N ; j++)
    //         for (size_t k=0 ; k<_N ; k++)
    //            C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
    //
    ////////////////////////////////////////////////////////////////////////////////////////////////
    
    //////////////////////////////////////////////////////////////////////////////////////////////////
    // Author: David Goz
    // mail  : david.goz@inaf.it
    // date  : 31.07.2024
    // code tested using nvhpc
    //
    // - Compile the code:
    //   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
    // - Run the code:
    //   $ ./classwork_omp
    //////////////////////////////////////////////////////////////////////////////////////////////////
    
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <time.h>
    #include <assert.h>
    #include <omp.h>
    #include <string.h>
    
    #define N                     512
    #define SIZE                  (N * N) // matrix size
    typedef double MyData;                // do not change
    
    #define LOOP 100
    #define NDEBUG
    
    double wall_time()
    {
      struct timespec ts;
      clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
      const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
    
      return ret;
    }
    
    void CPU_mat_mult(const MyData *const restrict A,
    		  const MyData *const restrict B,
    	                MyData *const restrict C,
    		  const size_t                 size)
    {
      for (size_t i=0 ; i<size ; i++)
        for (size_t j=0 ; j<size ; j++)
          for (size_t k=0 ; k<size ; k++)
    	C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);
    
      return;
    }
    
    void GPU_mat_mult(const MyData *const restrict A,
    		  const MyData *const restrict B,
    	                MyData *const restrict C,
    		  const size_t                 size)
    {
      #pragma omp target
      {
        #pragma omp teams distribute num_teams(size)
        for (size_t i=0 ; i<size ; i++)
          {
            #pragma omp parallel for num_threads(size)
    	for (size_t j=0 ; j<size ; j++)
    	  {
    	    MyData value = (MyData)0;
    	    for (size_t k=0 ; k<size ; k++)
    	      value += (A[(i * size) + k] * B[(k * size) + j]);
    
    	    C[(i * size) + j] = value;
    	  } // omp thread
          } // omp teams
       } // omp target
    
      return;
    }
    
    void GPU_mat_mult_no_loops(const MyData *const restrict A,
    			   const MyData *const restrict B,
    			         MyData *const restrict C,
    			   const size_t                 size)
    {
     #pragma omp target
      {
       #pragma omp teams num_teams(size)
        {
          const size_t team_size = (size * omp_get_team_num());
          
         #pragma omp parallel firstprivate(team_size) num_threads(size)
          {
             const size_t tid = omp_get_thread_num();
    	 MyData value = (MyData)0;
    	 for (size_t k=0 ; k<size ; k++)
    	   value += (A[team_size + k] * B[(k * size) + tid]);
    
    	 C[team_size + tid] = value;
          } // omp threads      
        } // omp teams
      } // omp target
    
      return;
    }
    
    void check(const MyData *const __restrict__ cpu_matrix,
    	   const MyData *const __restrict__ gpu_matrix)
    {
      int flag;
      for (size_t i=0 ; i<SIZE ; i++)
        flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0);
    
      if (!flag)
        printf("\n\t Result OK");
      else
        printf("\n\t Result wrong");
      
      return;
    }
    
    int main()
    {
      double time;
      MyData *buffer = (MyData *)calloc(4 * SIZE, sizeof(MyData));
      assert(buffer != NULL);
    
      // host reference matrix A
      MyData *const restrict A     = buffer;
      MyData *const restrict B     = A + SIZE;
      MyData *const restrict C_CPU = B + SIZE;
      MyData *const restrict C_GPU = C_CPU + SIZE;
      for (size_t i=0 ; i<SIZE ; i++)
        {
          A[i] = drand48();
          B[i] = drand48();
        }
    
      ////////////////////////// CPU naive algorithm //////////////////////////////////////////
      CPU_mat_mult(A, B, C_CPU, N);
      /////////////////////////////////////////////////////////////////////////////////////////
    
      // copy/alloc data to the GPU
      #pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE])
    
      /////////////////////////// GPU naive algorithm ////////////////////////////////////////
      time = 0.0;
      for (unsigned short int loop=0 ; loop<LOOP ; loop++)
        {
          const double start = wall_time();
          GPU_mat_mult(A, B, C_GPU, N);
          time += (wall_time() - start);
        }
      
      #pragma omp target update from(C_GPU[0:SIZE])
      check(C_CPU, C_GPU);
      printf("\n\t GPU naive time %lg [s]\n", (time / LOOP));
      ////////////////////////////////////////////////////////////////////////////////
    
      /////////////////////////// GPU naive no loops algorithm ////////////////////////////
      time = 0.0;
      for (unsigned short int loop=0 ; loop<LOOP ; loop++)
        {
          const double start = wall_time();
          GPU_mat_mult_no_loops(A, B, C_GPU, N);
          time += (wall_time() - start);
        }
      
      #pragma omp target update from(C_GPU[0:SIZE])
      check(C_CPU, C_GPU);
      printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP));
      ////////////////////////////////////////////////////////////////////////////////
    
      // free CPU memory
      free(buffer);
      // free GPU memory
      #pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C_CPU[0:SIZE])
      
      printf("\n");
    
      return EXIT_SUCCESS;
    }