//////////////////////////////////////////////////////////////////////////////////////////////////
// Assignment : write a CUDA code corresponding to the
// following sequential C code
//
// #include <stdio.h>
// #define ROW 4
// #define COL 8
// int main()
// {
//   int Matrix[ROW * COL];
//
//   for (int row=0 ; row<ROW ; row++)
//      for (int col=0 ; col<COL ; col++)
//         Matrix[(row * COL) + col] = ((row * COL) + col);
//
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc++ classwork.cu -o classwork
// - Run the code:
//   $ ./classwork
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda.h>

/* number of threads per block along X used for the kernel launch */
#define BLOCKSIZE 32
/* matrix dimensions: ROW rows by COL columns, stored as a flat array of ROW * COL elements */
#define ROW 1089
#define COL 1111

/* element type of the matrix (host and device copies) */
typedef int MyData;

/*
 * GPUMatrix: store in every element of a flattened matrix its own linear
 * index, i.e. Matrix[k] = k for 0 <= k < size.
 *
 * Matrix : device pointer to at least 'size' elements of type MyData
 * size   : number of elements to fill; nothing is written when size <= 0
 *
 * Launch layout: 1D. Only the x components of threadIdx, blockIdx,
 * blockDim and gridDim are used. Each thread starts from its global index
 * and advances by the total number of threads in the grid, so the whole
 * array is filled for any number of blocks (even fewer threads than
 * elements). No shared memory is required.
 */
__global__ void GPUMatrix(MyData *Matrix,
			  const int size)
{
  /* total number of threads along X; 64-bit arithmetic so that neither */
  /* the product nor the running index can wrap around                  */
  const long long stride = (long long)gridDim.x * blockDim.x;

  /* start from the global thread index and jump by the grid size */
  for (long long index = threadIdx.x + ((long long)blockIdx.x * blockDim.x);
       index < size;
       index += stride)
    {
      /* index < size <= INT_MAX here, hence the conversion is lossless */
      Matrix[index] = (MyData)index;
    }

  return;
}

/*
 * Terminate the program with a diagnostic when a CUDA runtime call fails.
 *
 * status : value returned by the CUDA runtime call
 * what   : short description of the operation, printed in the message
 */
static void checkCuda(const cudaError_t status,
		      const char *what)
{
  if (status != cudaSuccess)
    {
      fprintf(stderr, "\n\t CUDA error (%s): %s \n\n", what, cudaGetErrorString(status));
      exit(EXIT_FAILURE);
    }

  return;
}

/*
 * Fill a ROW x COL matrix on the device through GPUMatrix, copy it back
 * to the host and verify that every element equals its linear index.
 *
 * Returns EXIT_SUCCESS when the check passes, EXIT_FAILURE when the
 * result is wrong or when an allocation / CUDA call fails.
 */
int main()
{
  /* number of elements and size in bytes of the matrix */
  const size_t elements = ((size_t)ROW * COL);
  const size_t bytes    = (elements * sizeof(MyData));

  /* host matrix . 1D mapping matrix[i][j] ---> matrix[(i * COL) + j] */
  MyData *h_matrix = (MyData *)malloc(bytes);
  /* explicit test: assert() would vanish when compiling with -DNDEBUG */
  if (h_matrix == NULL)
    {
      fprintf(stderr, "\n\t Cannot allocate host memory \n\n");
      return EXIT_FAILURE;
    }

  /* device matrix */
  MyData *d_matrix = NULL;
  checkCuda(cudaMalloc((void **)&d_matrix, bytes), "cudaMalloc");

  /* 1D grid: ceiling division, so that the last (partial) block is included */
  const dim3 grid((((ROW * COL) + BLOCKSIZE - 1) / BLOCKSIZE), // number of blocks along X
		  1,                                           // number of blocks along Y
		  1);                                          // number of blocks along Z
  const dim3 block(BLOCKSIZE, // number of threads per block along X
		   1,         // number of threads per block along Y
		   1);        // number of threads per block along Z

  // kernel
  GPUMatrix<<<grid, block>>>(d_matrix, (ROW * COL));
  // a kernel launch returns nothing: configuration errors are retrieved here
  checkCuda(cudaGetLastError(), "GPUMatrix launch");

  // device synchronization (errors raised while the kernel runs show up here)
  checkCuda(cudaDeviceSynchronize(), "GPUMatrix execution");

  /* fetch matrix from the device */
  checkCuda(cudaMemcpy(h_matrix, d_matrix, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy");

  /* free device memory */
  checkCuda(cudaFree(d_matrix), "cudaFree");

  // check the result
  int flag = 0;
  for (size_t row=0 ; row<ROW ; row++)
    {
      const size_t i = (row * COL);

      for (size_t col=0 ; col<COL ; col++)
	{
	  /* compare as MyData to avoid a signed/unsigned comparison */
	  if (h_matrix[i + col] != (MyData)(i + col))
	    flag = -1;
	} /* col loop */
    } /* row loop */

  // free host memory
  free(h_matrix);

  if (flag)
    printf("\n\t Result wrong! \n\n");
  else
    printf("\n\t Result OK! \n\n");

  /* report a failed check to the caller through the exit status */
  return (flag ? EXIT_FAILURE : EXIT_SUCCESS);
}
