////////////////////////////////////////////////////////////////////////////////////////////////
// - You are given an N x N integer matrix M (N is set to 512 below; at most 1024);
// - Each row is assigned to a thread-block;
// - Each thread is assigned a matrix element M[i][j];
// - It changes M[i][j] to M[i][j] + M[i][j+1] (where possible);
// - Exploit shared-memory.
////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 12.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc++ classwork.cu -o classwork
// - Run the code:
//   $ ./classwork
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include <cuda.h>

// Compile-time configuration shared by the host and device code.
//
// N is the number of rows and of columns of the square matrix;
// the kernels recover (row, col) from a flat index as (id / N, id % N).
#define N                     512
#define SIZE                  (N * N) // total number of matrix elements
typedef int64_t MyData;               // matrix element type (do not change)
#define BLOCKSIZE             N       // number of threads per block (one block per matrix row)

// sanity check: reject a block size above 1024 threads
#if BLOCKSIZE > 1024
#error BLOCKSIZE cannot be larger than 1024
#endif

// the shared-memory kernel stages one block's elements in a buffer of
// BLOCKSIZE + 1 entries and relies on a block covering exactly one row
#if BLOCKSIZE != N
#error BLOCKSIZE must be equal to N
#endif

// number of repetitions over which each timing in main() is averaged
#define LOOP 100

// Return a wall-clock (elapsed real time) reading, in seconds.
//
// The value is taken from CLOCK_MONOTONIC, so only the difference between
// two readings is meaningful. A CPU-time clock is deliberately not used here:
// it does not advance while the calling process is blocked or sleeping, which
// would under-report any interval in which the host merely waits.
//
// Returns: seconds as a double (integer seconds plus the nanosecond fraction).
double wall_time()
{
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);

  return ((double) ts.tv_sec + ((double) ts.tv_nsec * 1.0e-9));
}

// Element-wise device copy: B[i] = A[i] for every i in [0, SIZE).
//
// Launch layout: 1D grid of 1D blocks; each thread handles the single
// element whose flat index equals its global thread id.
//   A: source array (read only)
//   B: destination array
__global__ void gpu_copy(const MyData *const __restrict__ A,
			       MyData *const __restrict__ B)
{
  const size_t idx = (threadIdx.x + (blockIdx.x * blockDim.x));

  // threads mapped past the end of the arrays have nothing to do
  if (idx < SIZE)
    B[idx] = A[idx];
}

// Global-memory version of the update M[i][j] += M[i][j+1].
//
// Every thread owns one matrix element, identified by its global thread id.
// The operands are read from 'buffer' (a copy of the input) and the result is
// written to 'matrix', so no thread reads a value another thread is updating.
//   matrix: output, SIZE elements stored row by row
//   buffer: input copy of the matrix, same layout (read only)
__global__ void gpu_matrix(      MyData *const __restrict__ matrix,
			   const MyData *const __restrict__ buffer)
{
  const size_t tid = (threadIdx.x + (blockIdx.x * blockDim.x));

  if (tid < SIZE)
    {
      // column of this element; since (row * N) + col == tid, the row is
      // not needed to address the arrays
      const size_t col = (tid % N);

      // the last column has no right-hand neighbour and is left unchanged
      const MyData right = ((col < (N - 1)) ? buffer[tid + 1] : 0);

      matrix[tid] = (buffer[tid] + right);
    }
}

// Shared-memory version of the in-place update M[i][j] += M[i][j+1].
//
// Preconditions: 1D launch with blockDim.x == BLOCKSIZE (== N), so that the
// elements staged by one block belong to a single matrix row.
// Shared memory: (BLOCKSIZE + 1) elements of MyData, statically allocated.
//
// Each thread stages its own element into shared memory, the block
// synchronizes, and then each thread adds its right-hand neighbour taken from
// the shared buffer. The extra slot buffer[BLOCKSIZE] stays zero, therefore
// the thread holding the last element adds 0 and leaves it unchanged.
//   matrix: SIZE elements stored row by row, updated in place
__global__ void gpu_matrix_shared(MyData *const __restrict__ matrix)
{
  // thread-id within the thread-block
  const size_t localID  = threadIdx.x;
  // global-thread-id
  const size_t globalID = (threadIdx.x + (blockIdx.x * blockDim.x));
  // Threads mapped past the end of the matrix must not touch global memory,
  // but they must still reach the barrier below: returning early before
  // __syncthreads() is unsafe as soon as a block straddles SIZE.
  const bool inRange = (globalID < SIZE);

  // shared memory buffer
  __shared__ MyData buffer[BLOCKSIZE + 1];
  // shared memory initialization (also zeroes the extra trailing slot)
  for (size_t i=localID ; i<(BLOCKSIZE + 1) ; i+=blockDim.x)
    buffer[i] = 0;

  // load data into shared memory; the element index (row * N) + col is
  // exactly the global thread id
  if (inRange)
    buffer[localID] = matrix[globalID];

  // block-thread synchronization: every thread of the block gets here
  __syncthreads();

  // perform the calculation and store the result
  if (inRange)
    matrix[globalID] = (buffer[localID] + buffer[localID + 1]);

  return;
}

// Host wrapper for the global-memory implementation.
//
// Copies 'matrix' into 'buffer' on the device, then launches gpu_matrix, which
// reads 'buffer' and writes the updated values back into 'matrix'. The call
// returns only after the device has finished.
//
// Launch and execution errors are reported on stderr; the sequence of
// operations is the same whether or not an error occurred.
//   matrix : device matrix, updated in place
//   buffer : device scratch array with as many elements as 'matrix'
//   nblocks: grid dimensions used for both kernels
//   block  : block dimensions used for both kernels
void GPU_matrix(MyData *const __restrict__ matrix,
		MyData *const __restrict__ buffer,
		const dim3                 nblocks,
		const dim3                 block)
{
  cudaError_t status;

  // gpu_copy: matrix ---> buffer
  gpu_copy<<< nblocks, block >>>(matrix, buffer);
  status = cudaGetLastError();
  if (status != cudaSuccess)
    fprintf(stderr, "\n\t GPU_matrix: gpu_copy launch failed: %s\n",
	    cudaGetErrorString(status));

  // perform the actual calculation M[i][j] += M[i][j+1]
  gpu_matrix<<< nblocks, block >>>(matrix, buffer);
  status = cudaGetLastError();
  if (status != cudaSuccess)
    fprintf(stderr, "\n\t GPU_matrix: gpu_matrix launch failed: %s\n",
	    cudaGetErrorString(status));

  // device synchronization (also surfaces errors raised while the kernels ran)
  status = cudaDeviceSynchronize();
  if (status != cudaSuccess)
    fprintf(stderr, "\n\t GPU_matrix: device synchronization failed: %s\n",
	    cudaGetErrorString(status));

  return;
}

// Host wrapper for the shared-memory implementation.
//
// Launches gpu_matrix_shared on 'matrix' and returns only after the device has
// finished. Launch and execution errors are reported on stderr.
//   matrix : device matrix, updated in place
//   nblocks: grid dimensions
//   block  : block dimensions
void GPU_matrix_shared(MyData *const __restrict__ matrix,
		       const dim3                 nblocks,
		       const dim3                 block)
{
  gpu_matrix_shared<<< nblocks, block >>>(matrix);
  cudaError_t status = cudaGetLastError();
  if (status != cudaSuccess)
    fprintf(stderr, "\n\t GPU_matrix_shared: kernel launch failed: %s\n",
	    cudaGetErrorString(status));

  // device synchronization (also surfaces errors raised while the kernel ran)
  status = cudaDeviceSynchronize();
  if (status != cudaSuccess)
    fprintf(stderr, "\n\t GPU_matrix_shared: device synchronization failed: %s\n",
	    cudaGetErrorString(status));

  return;
}

// Host reference implementation of M[i][j] += M[i][j+1].
//
// Rows are swept left to right, so each element is combined with the
// not-yet-updated value of its right-hand neighbour; the last column of every
// row is left untouched.
//   matrix: N x N elements stored row by row, updated in place
void CPU_matrix(MyData *const matrix)
{
  for (size_t i=0 ; i<N ; i++)
    {
      MyData *const row = (matrix + (i * N));

      // 'right' walks over the neighbour column, starting from column 1
      for (size_t right=1 ; right<N ; right++)
	row[right - 1] += row[right];
    }
}

// Compare two SIZE-element matrices and print the verdict on stdout.
//
// Prints "Result OK" followed by 'msg' when every element matches, otherwise
// "Result wrong" followed by 'msg'.
//   cpu_matrix: reference values
//   gpu_matrix: values to be validated
//   msg       : text appended to the verdict
void check(const MyData *const __restrict__ cpu_matrix,
	   const MyData *const __restrict__ gpu_matrix,
	   const char   *const __restrict__ msg)
{
  // The mismatch flag has to be sticky: a single differing element anywhere
  // in the matrix must decide the verdict, not just the last element.
  int flag = 0;
  for (size_t i=0 ; i<SIZE ; i++)
    if (cpu_matrix[i] != gpu_matrix[i])
      {
	flag = 1;
	break; // one mismatch is enough
      }

  if (!flag)
    printf("\n\t Result OK %s", msg);
  else
    printf("\n\t Result wrong %s", msg);

  return;
}

// Print a diagnostic on stderr and terminate the program when a CUDA runtime
// call did not succeed.
//   status: value returned by the runtime call (or by cudaGetLastError())
//   what  : short description of the operation, used in the diagnostic
static void cuda_check_or_exit(const cudaError_t status, const char *const what)
{
  if (status == cudaSuccess)
    return;

  fprintf(stderr, "\n\t CUDA error during %s: %s\n", what, cudaGetErrorString(status));
  exit(EXIT_FAILURE);
}

// Driver: builds a pseudo-random N x N matrix, computes the reference result
// on the host, then runs the global-memory and the shared-memory GPU
// implementations, validating each one against the host result and printing
// the time per call averaged over LOOP repetitions.
//
// Returns EXIT_SUCCESS, or EXIT_FAILURE when a host allocation or a CUDA
// runtime call fails.
int main()
{
  const size_t bytes = (SIZE * sizeof(MyData));
  double time;

  // host reference matrix A, filled with pseudo-random values in [0, SIZE)
  MyData *cpu_A = (MyData *)malloc(bytes);
  if (cpu_A == NULL)
    {
      fprintf(stderr, "\n\t Cannot allocate cpu_A on the host\n");
      return EXIT_FAILURE;
    }
  for (size_t i=0 ; i<SIZE ; i++)
    cpu_A[i] = (lrand48() % SIZE);

  // device reference matrix A
  MyData *gpu_A = NULL;
  cuda_check_or_exit(cudaMalloc((void **)&gpu_A, bytes), "cudaMalloc(gpu_A)");
  cuda_check_or_exit(cudaMemcpy(gpu_A, cpu_A, bytes, cudaMemcpyHostToDevice),
		     "cudaMemcpy(cpu_A -> gpu_A)");

  ////////////////////////// CPU MATRIX //////////////////////////////////////////
  // allocate a second matrix where to perform the actual computation
  MyData *cpu_matrix = (MyData *)malloc(bytes);
  if (cpu_matrix == NULL)
    {
      fprintf(stderr, "\n\t Cannot allocate cpu_matrix on the host\n");
      free(cpu_A);
      cudaFree(gpu_A);
      return EXIT_FAILURE;
    }

  // perform the actual computation
  time = 0.0;
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      // restore the input: the computation is performed in place
      memcpy(cpu_matrix, cpu_A, bytes);

      const double start = wall_time();
      CPU_matrix(cpu_matrix);
      time += (wall_time() - start);
    }
  printf("\n\t CPU time %lg [s]\n", (time / LOOP));
  ////////////////////////////////////////////////////////////////////////////////

  ///////////////////////////// GPU MATRIX ///////////////////////////////////////
  // allocate a second matrix where to perform the actual computation
  MyData *gpu_matrix = NULL;
  cuda_check_or_exit(cudaMalloc((void **)&gpu_matrix, bytes), "cudaMalloc(gpu_matrix)");

  // allocate a buffer matrix useful to copy the matrix before the computation
  // in order to avoid thread race-conditions
  MyData *gpu_buffer = NULL;
  cuda_check_or_exit(cudaMalloc((void **)&gpu_buffer, bytes), "cudaMalloc(gpu_buffer)");

  const dim3 block   = {BLOCKSIZE, 1, 1};
  const dim3 nblocks = {((SIZE + BLOCKSIZE - 1) / BLOCKSIZE), 1, 1};

  time = 0.0;
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      // restore data into gpu_matrix (outside the timed region)
      gpu_copy<<< nblocks, block>>>(gpu_A, gpu_matrix);
      cuda_check_or_exit(cudaGetLastError(), "gpu_copy launch");
      cuda_check_or_exit(cudaDeviceSynchronize(), "gpu_copy execution");

      const double start = wall_time();
      GPU_matrix(gpu_matrix, gpu_buffer, nblocks, block);
      time += (wall_time() - start);
    }

  // copy gpu_matrix into cpu_A (cpu_A is reused as the host landing buffer)
  cuda_check_or_exit(cudaMemcpy(cpu_A, gpu_matrix, bytes, cudaMemcpyDeviceToHost),
		     "cudaMemcpy(gpu_matrix -> cpu_A)");

  check(cpu_matrix, cpu_A, "\t global memory implementation --->");
  printf("\n\t GPU time  %lg [s]\n", (time / LOOP));
  //////////////////////////////////////////////////////////////////////////////////////////

  ////////////////////////// GPU MATRIX shared memory //////////////////////////////////////
  // gpu_buffer is no longer required
  cuda_check_or_exit(cudaFree(gpu_buffer), "cudaFree(gpu_buffer)");

  time = 0.0;
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      // restore data into gpu_matrix (outside the timed region)
      gpu_copy<<< nblocks, block>>>(gpu_A, gpu_matrix);
      cuda_check_or_exit(cudaGetLastError(), "gpu_copy launch");
      cuda_check_or_exit(cudaDeviceSynchronize(), "gpu_copy execution");

      const double start = wall_time();
      GPU_matrix_shared(gpu_matrix, nblocks, block);
      time += (wall_time() - start);
    }

  // copy gpu_matrix into cpu_A
  cuda_check_or_exit(cudaMemcpy(cpu_A, gpu_matrix, bytes, cudaMemcpyDeviceToHost),
		     "cudaMemcpy(gpu_matrix -> cpu_A)");
  check(cpu_matrix, cpu_A, "\t shared memory implementation --->");
  printf("\n\t GPU time  %lg [s]\n", (time / LOOP));
  //////////////////////////////////////////////////////////////////////////////////////////

  // free CPU-GPU memory
  free(cpu_matrix);
  free(cpu_A);
  cuda_check_or_exit(cudaFree(gpu_A), "cudaFree(gpu_A)");
  cuda_check_or_exit(cudaFree(gpu_matrix), "cudaFree(gpu_matrix)");

  return EXIT_SUCCESS;
}
