//////////////////////////////////////////////////////////////////////////////////////////////////
// Memory coalescing is a technique which allows optimal usage of the global memory bandwidth.
// That is, when parallel threads running the same instruction access consecutive locations
// in the global memory, the most favorable access pattern is achieved.
//
// Assignment : write a kernel to vary the degree of coalescing from 1 to 32 based on input
//             argument
//
// Hint: The Degree of Coalescing (DoC) is defined here as 33 minus the number of memory
//       transactions required for a warp to execute an instruction.
// Example:
//          - buffer[threadIdx.x] has DoC of 32;
//          - buffer[rand()]      has DoC of 1 in the worst case.
//////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 12.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc++ coalescing.cu -o coalescing
// - Run the code:
//   $ ./coalescing
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cuda.h>

typedef double MyData;
#define N      1024
#define WARP   32
#define BLOCK  1024

// Kernel: each of the first WARP threads of the grid copies one element from `source` to `sink`,
// reading `source` with a stride derived from the requested degree of coalescing.
//
// source : device input buffer; the farthest element read is index (WARP - 1) * WARP
// sink   : device output buffer holding at least WARP elements
// DoC    : requested degree of coalescing, expected in [1, WARP]; any other value is a no-op
//
// Launch layout: 1D grid of 1D blocks providing at least WARP threads in total; threads whose
// global index is >= WARP exit immediately. No shared memory is used.
__global__ void GPUDoC(const MyData *const __restrict__ source,
                             MyData *const __restrict__ sink,
                       const size_t                     DoC)
{
  // widen before multiplying so the global index cannot wrap in 32-bit arithmetic
  const size_t myID = (threadIdx.x + ((size_t)blockIdx.x * blockDim.x));

  // only the first WARP threads of the grid do any work
  if (myID >= WARP)
    return;

  // the stride below is unsigned: without this guard a DoC larger than (WARP + 1) would wrap
  // around to a huge stride and read far outside `source`
  if ((DoC < 1) || (DoC > WARP))
    return;

  // load data using a given DoC
  const size_t stride = ((WARP + 1) - DoC);

  // stride 1  (DoC=32): the WARP threads read WARP contiguous elements (best case);
  // stride 2  (DoC=31): the WARP threads read elements spread over a span of 2 * WARP elements;
  // stride 3  (DoC=30): the WARP threads read elements spread over a span of 3 * WARP elements;
  // ...
  // stride 32 (DoC=1) : the WARP threads read elements spread over a span of 32 * WARP elements.
  //
  // In the idealised model of this exercise the number of memory transactions equals the stride.
  // NOTE(review): the real transaction count also depends on sizeof(MyData) and on the segment
  // size of the target GPU - confirm with a profiler.

  sink[myID] = source[myID * stride];
}

// Checks the status of a host <-> device copy and aborts the program on failure.
//
// error : status returned by the cudaMemcpy call being checked
//
// On failure the CUDA error description is written to stderr and the process exits with
// EXIT_FAILURE; on success the function returns without side effects.
void CPU_GPU_memory_copy(const cudaError error)
{
  if (error != cudaSuccess)
    {
      // report the actual cause, not just the fact that something failed
      fprintf(stderr, "\n\t cudaMemcpy fails! (%s) ... aborting ... \n",
	      cudaGetErrorString(error));
      exit(EXIT_FAILURE);
    }
}

// Checks the status of a device allocation and aborts the program on failure.
//
// error : status returned by the cudaMalloc call being checked
//
// On failure the CUDA error description is written to stderr and the process exits with
// EXIT_FAILURE; on success the function returns without side effects.
void GPU_alloc_check(const cudaError error)
{
  if (error != cudaSuccess)
    {
      // report the actual cause, not just the fact that something failed
      fprintf(stderr, "\n\t cudaMalloc fails! (%s) ... aborting ... \n",
	      cudaGetErrorString(error));
      exit(EXIT_FAILURE);
    }
}

// Host driver: copies `points` to the device, runs the GPUDoC kernel once with the requested
// degree of coalescing, waits for it to finish and releases the device memory.
//
// points : host array; the first N elements are copied to the device
// DoC    : degree of coalescing forwarded unchanged to the kernel
//
// Any CUDA failure (allocation, copy, launch, execution, free) is reported and terminates the
// process with EXIT_FAILURE.
void GPU_DoC(const MyData *const points,
	     const size_t        DoC)
{
  //****************** GPU memory allocation ***************************************************//
  // one allocation backs both buffers: N input elements followed by WARP output elements
  MyData *source=NULL, *sink=NULL;
  GPU_alloc_check(cudaMalloc((void **)&source, ((N + WARP) * sizeof(MyData))));
  sink = source + N;
  //********************************************************************************************//

  //***************** Host to GPU memory copy **************************************************//
  CPU_GPU_memory_copy(cudaMemcpy(source, points, (N * sizeof(MyData)),
				 cudaMemcpyHostToDevice));
  //********************************************************************************************//

  //***************** Kernel launch ************************************************************//
  // the kernel needs WARP working threads: round up to whole blocks of BLOCK threads
  const size_t nblocks = ((WARP + BLOCK - 1) / BLOCK);

  // %zu matches size_t and %d matches the int constant BLOCK
  printf("\n\t DoC: %zu", DoC);
  printf("\n\t GPU_DoC<<< %zu, %d >>>(source, sink, %zu)\n\n", nblocks, BLOCK, DoC);

  GPUDoC<<< nblocks, BLOCK >>>(source, sink, DoC);

  // a launch does not return a status: configuration errors are picked up here ...
  cudaError status = cudaGetLastError();

  // ... while errors raised during execution surface at the synchronization point
  if (status == cudaSuccess)
    status = cudaDeviceSynchronize();

  // Free GPU memory (always attempted; the first error encountered is the one reported)
  const cudaError released = cudaFree(source);
  if (status == cudaSuccess)
    status = released;

  if (status != cudaSuccess)
    {
      fprintf(stderr, "\n\t GPUDoC kernel fails! (%s) ... aborting ... \n",
	      cudaGetErrorString(status));
      exit(EXIT_FAILURE);
    }
  //********************************************************************************************//
}

// Entry point: reads the requested degree of coalescing from the command line, fills a host
// buffer with N random values and runs the GPU test once.
//
// NOTE: the parameter names are swapped with respect to the usual convention:
//       here `argv` is the argument count and `argc` is the argument vector.
//
// Returns EXIT_SUCCESS on completion; exits with EXIT_FAILURE on a missing or invalid argument
// or when the host buffer cannot be allocated.
int main(int argv, char *argc[])
{
  if (argv < 2)
    {
      printf("\n\t Usage: ./coalescing <degree of coalescing>\n\n");
      exit(EXIT_FAILURE);
    }

  // strtol, unlike atoi, has defined behaviour on overflow and reports where parsing stopped,
  // so out-of-range values and trailing garbage are rejected instead of being silently wrapped
  char *end = NULL;
  const long requested = strtol(argc[1], &end, 10);

  if ((end == argc[1]) || (*end != '\0') || (requested < 1) || (requested > 32))
    {
      printf("\n\t 1 <= <degree of coalescing> <= 32 ... aborting ... \n\n");
      exit(EXIT_FAILURE);
    }

  const size_t DoC = (size_t)requested;

  // host input: N pseudo-random values, seeded from the wall clock
  srand48(time(NULL));
  MyData *points = (MyData *)malloc(N * sizeof(*points));
  if (points == NULL)
    {
      fprintf(stderr, "\n\t malloc fails! ... aborting ... \n");
      exit(EXIT_FAILURE);
    }

  for (size_t i=0 ; i<N ; i++)
    points[i] = drand48();

  // launch kernel
  GPU_DoC(points, DoC);

  free(points);

  return EXIT_SUCCESS;
}
