//////////////////////////////////////////////////////////////////////////////////////////////////
// Assignment : write a CUDA code corresponding to the
// following sequential C code
//
// #include <stdio.h>
// int main()
// {
//   /* Matrix multiplication: C = A X B */
//
//   int A[N][N], B[N][N], C[N][N];
//
//   for (int ii=0 ; ii<N ; ii++)
//      for (int jj=0 ; jj<N ; jj++)
//         for (int kk=0 ; kk<N ; kk++)
//            C[ii][jj] += A[ii][kk] * B[kk][jj];
//
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 08.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc++ classwork.cu -o classwork
// - Run the code:
//   $ ./classwork <N>
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <assert.h>

/* edge length, in threads, of the square CUDA thread block used for the kernel launch */
#define BLOCKSIZE 32
/* SIZE * SIZE, i.e. the number of elements of a SIZE x SIZE matrix.       */
/* N.B. the product is evaluated in the type of SIZE (no widening is done) */
#define SQUARE(SIZE) ((SIZE) * (SIZE))

/* integer type used for the matrix elements and for the sizes/indexes passed to the routines */
typedef long int MyData;

/*
 * Initialize the two input matrices on the host.
 *
 * AB : buffer holding at least 2 * N * N elements; the first N * N elements
 *      are matrix A, the following N * N elements are matrix B
 *      (both stored row by row)
 * N  : number of rows (and columns) of each matrix
 *
 * On return A[ii][jj] = ii - jj and B[ii][jj] = 1.
 */
void matrix_init(      MyData *const AB,
		 const MyData        N)
{
  /* A and B are disjoint halves of AB, hence they never alias.       */
  /* N.B. '__restrict__' is used instead of the C99 keyword 'restrict' */
  /* because the latter is not a C++ keyword                           */
  MyData *const __restrict__ A = AB;
  MyData *const __restrict__ B = AB + SQUARE(N);

  for (MyData ii=0 ; ii<N ; ii++)
    {
      /* offset of the first element of row ii */
      const MyData row = (ii * N);

      for (MyData jj=0 ; jj<N ; jj++)
	{
	  A[row + jj] = (ii - jj);
	  B[row + jj] = 1;
	}
    }

  return;
}

/*
 * Matrix multiplication C = A x B for N x N matrices stored row by row.
 *
 * A, B : input matrices  (device pointers, N * N elements each)
 * C    : output matrix   (device pointer,  N * N elements); every element
 *        is overwritten, so C does not need to be initialized
 * N    : number of rows (and columns) of the matrices
 *
 * Launch layout: 2D grid of 2D blocks providing at least N threads along
 * both x and y. Each thread computes one element of C; the x index selects
 * the column and the y index selects the row. Threads falling outside the
 * matrix return immediately.
 */
__global__ void GPU_MM(const MyData *const __restrict__ A,
		       const MyData *const __restrict__ B,
		             MyData *const __restrict__ C,
		       const MyData                     N)
{
  /* widen before multiplying so the product is not evaluated in 32 bits */
  const MyData col = threadIdx.x + ((MyData)blockIdx.x * blockDim.x);
  const MyData row = threadIdx.y + ((MyData)blockIdx.y * blockDim.y);

  /* check out of boundaries */
  if ((row >= N) || (col >= N))
    return;

  /* each thread performs the calculation of one element   */
  /* of the matrix, i.e. C[row][col].                      */
  /* threadIdx.x is mapped onto the column so that threads */
  /* with consecutive threadIdx.x read consecutive         */
  /* elements of B and write consecutive elements of C,    */
  /* while they all read the same element of A             */

  const MyData *const a_row = A + (row * N);

  MyData sum = 0;
  for (MyData kk=0 ; kk<N ; kk++)
    sum += (a_row[kk] * B[(kk * N) + col]);

  C[(row * N) + col] = sum;

  return;
}

/*
 * Verify the matrix product computed on the GPU and print the outcome.
 *
 * GPU_MM : N x N result matrix, stored row by row, already copied to the host
 *          (N.B. inside this function the name hides the GPU_MM kernel)
 * A      : N x N left-hand input matrix, stored row by row
 * N      : number of rows (and columns) of the matrices
 *
 * Every element of row ii of the result is compared with the sum of row ii
 * of A: this is the expected product when B is filled with ones, as done by
 * matrix_init(). Prints "Result OK" or "Result WRONG" on stdout.
 */
void check(const MyData *const __restrict__ GPU_MM,
	   const MyData *const __restrict__ A,
	   const MyData                     N)
{
  int flag = 0;

  /* stop scanning as soon as the first mismatch is found */
  for (MyData ii=0 ; (ii<N) && !flag ; ii++)
    {
      const MyData row = (ii * N);

      /* expected value of every element of row ii */
      MyData row_a = 0;
      for (MyData kk=0 ; kk<N ; kk++)
	row_a += A[row + kk];

      for (MyData jj=0 ; jj<N ; jj++)
	if (GPU_MM[row + jj] != row_a)
	  {
	    flag = 1;
	    break;
	  }
    }

  if (flag)
    printf("\n\t Result WRONG \n\n");
  else
    printf("\n\t Result OK \n\n");

  return;
}

/* Print a diagnostic and terminate the program if a CUDA runtime call failed */
static void cuda_check(const cudaError_t       status,
		       const char       *const call,
		       const int               line)
{
  if (status != cudaSuccess)
    {
      fprintf(stderr, "\n\t CUDA error at line %d: %s \n\t %s \n\n",
	      line, call, cudaGetErrorString(status));
      exit(EXIT_FAILURE);
    }

  return;
}

/* wrap every CUDA runtime call: errors are reported instead of silently ignored */
#define CUDA_CHECK(call) cuda_check((call), #call, __LINE__)

/*
 * Program entry point.
 *
 * Usage: ./classwork <N>, N being the number of rows (and columns) of the
 * square matrices. The program initializes A and B on the host, computes
 * C = A x B on the GPU, copies C back and verifies it on the host.
 *
 * Returns 0 on completion; exits with EXIT_FAILURE on wrong usage,
 * allocation failure or CUDA error.
 */
int main(int arg, char *argv[])
{
  int N;

  if (arg < 2)
    {
      printf("\n\t Usage: $ ./classwork <number_of_matrix_elements> \n");
      exit(EXIT_FAILURE);
    }
  else
    {
      N = atoi(argv[1]);
      if (N <= 0)
	{
	  printf("\n\t Number of matrix elements must be greater than zero \n");
	  exit(EXIT_FAILURE);
	}
      else
	{
	  printf("\n\t Matrix size    : %d x %d", N, N);
	  printf("\n\t CUDA block size: %d x %d", BLOCKSIZE, BLOCKSIZE);
	}
    }

  /* sizes are evaluated in size_t: N * N (and 3 * N * N) */
  /* would overflow if evaluated using 'int' arithmetic   */
  const size_t side         = (size_t)N;
  const size_t max_elements = (((size_t)-1) / (3 * sizeof(MyData)));
  if (side > (max_elements / side))
    {
      fprintf(stderr, "\n\t Matrix size too large for this platform \n\n");
      exit(EXIT_FAILURE);
    }
  const size_t n_elements   = (side * side);
  const size_t matrix_bytes = (n_elements * sizeof(MyData));

  /* host memory allocation (A, B and C are stored back to back) */
  MyData *h_buffer = (MyData *)malloc(3 * matrix_bytes);
  if (h_buffer == NULL)
    {
      /* explicit check: assert() is compiled out when NDEBUG is defined */
      fprintf(stderr, "\n\t Cannot allocate host memory \n\n");
      exit(EXIT_FAILURE);
    }
  /* set-up host pointers */
  MyData *const h_A = h_buffer;
  MyData *const h_B = h_A + n_elements;
  MyData *const h_C = h_B + n_elements;

  /* device memory allocation (same layout as the host buffer) */
  MyData *d_buffer = NULL;
  CUDA_CHECK(cudaMalloc((void **)&d_buffer, (3 * matrix_bytes)));
  /* set-up device pointers */
  MyData *const d_A = d_buffer;
  MyData *const d_B = d_A + n_elements;
  MyData *const d_C = d_B + n_elements;

  // matrix initialization
  matrix_init(h_buffer, N);

  /* copy host data to device (A and B are contiguous: one single copy) */
  CUDA_CHECK(cudaMemcpy(d_A, h_A, (2 * matrix_bytes), cudaMemcpyHostToDevice));

  // kernel launch on GPU
  /* number of blocks per dimension, rounded up to cover the whole matrix */
  const unsigned int nblocks = (unsigned int)((side + BLOCKSIZE - 1) / BLOCKSIZE);
  const dim3 grid(nblocks, nblocks, 1);
  const dim3 block(BLOCKSIZE, BLOCKSIZE, 1);
  GPU_MM<<< grid, block >>>(d_A, d_B, d_C, N);
  /* a kernel launch does not return an error code: query it explicitly */
  CUDA_CHECK(cudaGetLastError());

  /* host-device data synchronization         */
  /* N.B. cudaDeviceSynchronize() is not      */
  /* necessary since cudaMemcpy() is blocking */
  CUDA_CHECK(cudaMemcpy(h_C, d_C, matrix_bytes, cudaMemcpyDeviceToHost));

  // free the GPU memory
  CUDA_CHECK(cudaFree(d_buffer));

  // check the result
  check(h_C, h_A, N);

  free(h_buffer);

  return 0;
}
