////////////////////////////////////////////////////////////////////////////////////////////////
// $ The program tests the SoA vs AoS approaches on GPU
// In computing, an array of structures (AoS), structure of arrays (SoA) or
// array of structures of arrays (AoSoA) are contrasting ways to arrange a sequence
// of records in memory, with regard to interleaving, and are of interest in SIMD
// and SIMT programming.
////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 12.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc++ aos_soa.cu -o aos_soa
// - Run the code:
//   $ ./aos_soa
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <iostream>
#include <vector>
#include <unistd.h>
#include <time.h>
#include <cuda.h>

// multiplicative factor converting a size in bytes to MB (1 MB = 1024 * 1024 bytes)
#define Byte_to_MB (1.0 / (1024 * 1024))
// number of timed launches of each kernel; the reported time is the mean over them
#define LOOP       100
// number of threads per block (x dimension) used for every kernel launch in main
#define BLOCKSIZE  1024
// compile-time guard: refuse to build with a block size above 1024 threads
#if BLOCKSIZE > 1024
#error BLOCKSIZE cannot be larger than 1024
#endif

// datatypes
// One record of the array-of-structures (AoS) layout: the five fields of a
// record are stored together, and the benchmark allocates an array of records.
// The field order must not change: gpu_nodeAoS initialises a record
// positionally, and main sizes the problem from sizeof(nodeAoS).
struct nodeAoS
{
  double   a; // filled by the kernels with the element index as double
  float    b; // filled by the kernels with the element index as float
  long int c; // filled by the kernels with the element index as long int
  int      d; // filled by the kernels with the element index as int
  char     e; // filled by the kernels with the constant 'a'
};

// Structure-of-arrays (SoA) layout: one pointer per field, each pointing to
// its own array. In main every pointer receives a separate cudaMalloc
// allocation of N elements; the struct itself lives on the host and only the
// individual pointers are passed to the gpu_SoA kernel.
struct nodeSoA
{
  double   *a; // array of the 'a' fields
  float    *b; // array of the 'b' fields
  long int *c; // array of the 'c' fields
  int      *d; // array of the 'd' fields
  char     *e; // array of the 'e' fields
};

// Abort the program if a GPU memory allocation failed.
//
// Parameter:
//   error - status code returned by the allocation call (main passes the
//           return value of cudaMalloc)
//
// Returns normally when error is cudaSuccess. For any other status it prints
// a message that includes the runtime's description of the error (so that an
// out-of-memory condition can be told apart from, e.g., a missing device)
// and terminates the process with EXIT_FAILURE.
void GPU_alloc_check(const cudaError error)
{
  using namespace std;

  if (error != cudaSuccess)
    {
      cout << "\t cudaMalloc fails! (" << cudaGetErrorString(error) << ") ... aborting ..." << endl;
      exit(EXIT_FAILURE);
    }

  return;
}

// Return the current reading of a monotonic wall clock, in seconds.
//
// Only the difference between two calls is meaningful (the starting point of
// CLOCK_MONOTONIC is unspecified). A wall clock is required for timing GPU
// work from the host: a CPU-time clock such as CLOCK_PROCESS_CPUTIME_ID
// counts the CPU time consumed by the process, which does not advance while
// the host sleeps and accumulates over all threads of the process, so it
// does not measure elapsed time.
double wall_time()
{
  // zero-initialised so that a failing clock_gettime yields 0 instead of garbage
  struct timespec ts = {0, 0};
  clock_gettime(CLOCK_MONOTONIC, &ts);

  return (double) ts.tv_sec + (double) ts.tv_nsec * 1.0e-9;
}

// Kernel: initialise an array of structures, one field at a time.
//
// Each thread handles the element whose index equals its global thread ID
// and writes the five fields of that element with five separate assignments.
//
// Parameters:
//   allnodeAoS - array of at least `size` elements (main passes a buffer
//                obtained from cudaMalloc)
//   size       - number of elements to initialise
//
// Launch layout: only the x dimension of the grid and of the block is used;
// the launch must provide at least `size` threads in total. Threads whose ID
// is beyond `size` do nothing.
__global__ void gpu_AoS(struct nodeAoS *const allnodeAoS,
                        const size_t          size)
{
  // Widen before multiplying: blockIdx.x and blockDim.x are 32-bit unsigned,
  // so their product would wrap around for launches with more than 2^32 threads.
  const size_t ID = ((size_t)blockIdx.x * (size_t)blockDim.x) + (size_t)threadIdx.x;

  if (ID < size)
    {
      // Field-by-field stores: for a given field, adjacent threads write
      // addresses that are sizeof(struct nodeAoS) bytes apart (strided,
      // i.e. not contiguous across the warp).
      allnodeAoS[ID].a = (double)ID;
      allnodeAoS[ID].b = (float)ID;
      allnodeAoS[ID].c = (long int)ID;
      allnodeAoS[ID].d = (int)ID;
      allnodeAoS[ID].e = 'a';
    }

  return;
}

// Kernel: initialise an array of structures by assigning a whole structure.
//
// Each thread builds a complete nodeAoS value in a local variable and stores
// it to the element whose index equals its global thread ID with a single
// struct assignment. The resulting array content is the same as gpu_AoS.
//
// Parameters:
//   allnodeAoS - array of at least `size` elements (main passes a buffer
//                obtained from cudaMalloc)
//   size       - number of elements to initialise
//
// Launch layout: only the x dimension of the grid and of the block is used;
// the launch must provide at least `size` threads in total. Threads whose ID
// is beyond `size` do nothing.
__global__ void gpu_nodeAoS(struct nodeAoS *const allnodeAoS,
                            const size_t          size)
{
  // Widen before multiplying: blockIdx.x and blockDim.x are 32-bit unsigned,
  // so their product would wrap around for launches with more than 2^32 threads.
  const size_t ID = ((size_t)blockIdx.x * (size_t)blockDim.x) + (size_t)threadIdx.x;

  if (ID >= size)
    return;

  // positional initialisation: the order follows the field order of nodeAoS
  const struct nodeAoS node = {(double)ID, (float)ID, (long int)ID, (int)ID, 'a'};

  // Whole-struct store: adjacent threads write adjacent elements, so a warp
  // covers one contiguous range of memory.
  // NOTE(review): how this assignment is lowered to store instructions (and
  // hence how well it coalesces compared with gpu_AoS) is up to the
  // compiler - confirm with a profiler.
  allnodeAoS[ID] = node;

  return;
}

// Kernel: initialise a structure of arrays.
//
// Each thread handles the index equal to its global thread ID and writes
// that position in each of the five arrays.
//
// Parameters:
//   a, b, c, d, e - the five field arrays, each of at least `size` elements
//                   (main passes five separate cudaMalloc buffers); declared
//                   __restrict__, so they must not overlap
//   size          - number of elements to initialise in every array
//
// Launch layout: only the x dimension of the grid and of the block is used;
// the launch must provide at least `size` threads in total. Threads whose ID
// is beyond `size` do nothing.
__global__ void gpu_SoA(double   *const __restrict__ a,
                        float    *const __restrict__ b,
                        long int *const __restrict__ c,
                        int      *const __restrict__ d,
                        char     *const __restrict__ e,
                        const size_t                 size)
{
  // Widen before multiplying: blockIdx.x and blockDim.x are 32-bit unsigned,
  // so their product would wrap around for launches with more than 2^32 threads.
  const size_t ID = ((size_t)blockIdx.x * (size_t)blockDim.x) + (size_t)threadIdx.x;

  if (ID < size)
    {
      // Adjacent threads write adjacent elements of each array, so the stores
      // of a warp to one array are contiguous (coalesced access).
      a[ID] = (double)ID;
      b[ID] = (float)ID;
      c[ID] = (long int)ID;
      d[ID] = (int)ID;
      e[ID] = 'a';
    }

  return;
}

// Print the free and total memory of every GPU visible to the process and
// return the free memory of GPU 0.
//
// Returns:
//   free memory of GPU 0, in bytes, as reported by cudaMemGetInfo.
//
// Side effects:
//   - one report line per GPU is written to standard output;
//   - GPU 0 is the current device on return, so that allocations sized from
//     the returned value are made on the GPU the value refers to (the query
//     loop switches the current device to each GPU in turn);
//   - the process is terminated with EXIT_FAILURE if no GPU is available or
//     if any CUDA runtime call fails.
size_t get_gpu_free_memory()
{
  using namespace std;

  // report a failed CUDA runtime call and abort
  const auto check = [](const cudaError_t status, const char *const what)
    {
      if (status != cudaSuccess)
        {
          cout << "\t " << what << " fails! (" << cudaGetErrorString(status) << ") ... aborting ..." << endl;
          exit(EXIT_FAILURE);
        }
    };

  int num_gpus{0};
  // get the number of available gpus on the system
  check(cudaGetDeviceCount(&num_gpus), "cudaGetDeviceCount");
  if (num_gpus < 1)
    {
      // without this guard free_memory[0] below would index an empty vector
      cout << "\t no GPU available! ... aborting ..." << endl;
      exit(EXIT_FAILURE);
    }

  vector<size_t> free_memory(num_gpus, 0);
  vector<size_t> total_memory(num_gpus, 0);
  for (int gpu_id=0 ; gpu_id<num_gpus ; gpu_id++)
    {
      // cudaMemGetInfo reports on the current device, hence the switch
      check(cudaSetDevice(gpu_id), "cudaSetDevice");
      check(cudaMemGetInfo(&free_memory[gpu_id], &total_memory[gpu_id]), "cudaMemGetInfo");
      cout << "\n\t GPU " << gpu_id << " - free memory: " << free_memory[gpu_id]*Byte_to_MB << "[MB] - total memory: " << total_memory[gpu_id]*Byte_to_MB << " [MB]" << endl;
    }

  // the loop leaves the last GPU selected: make the first GPU current again,
  // since its free memory is what the caller gets
  check(cudaSetDevice(0), "cudaSetDevice");

  // return the free memory of the first GPU
  return free_memory[0];
}
    
// Benchmark driver.
//
// Sizes the problem from the free memory reported by get_gpu_free_memory():
// the AoS buffer takes N * sizeof(nodeAoS) bytes (about half of that memory)
// and the five SoA arrays take N elements each. Then, for each of the three
// kernels (gpu_AoS, gpu_nodeAoS, gpu_SoA), it runs one untimed warm-up launch
// followed by LOOP timed launches and prints the mean time per launch.
//
// Returns EXIT_SUCCESS; terminates with EXIT_FAILURE if an allocation, a
// kernel launch, a synchronisation or a deallocation fails.
int main()
{
  using namespace std;

  // report a failed CUDA runtime call / kernel launch and abort
  const auto check = [](const cudaError_t status, const char *const what)
    {
      if (status != cudaSuccess)
        {
          cout << "\t " << what << " fails! (" << cudaGetErrorString(status) << ") ... aborting ..." << endl;
          exit(EXIT_FAILURE);
        }
    };

  // free gpu mem in bytes
  const size_t gpu_free_mem = get_gpu_free_memory();
  // number of elements in the AoS
  const size_t N = (gpu_free_mem / 2 / sizeof(nodeAoS));
  if (N == 0)
    {
      // a launch with zero blocks is an invalid configuration
      cout << "\t not enough free GPU memory! ... aborting ..." << endl;
      exit(EXIT_FAILURE);
    }

  double start_time{0};
  double end_time{0};

  const dim3 block{BLOCKSIZE, 1, 1};
  // ceil-division; the explicit cast avoids a narrowing conversion
  // (size_t -> unsigned int) inside the braced initialiser
  const dim3 nblocks{static_cast<unsigned int>((N + BLOCKSIZE - 1) / BLOCKSIZE), 1, 1};

  // allocate buffer for the GPU to store AoS and SoA
  nodeAoS *gpu_allnodeAoS{nullptr};
  GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeAoS, (N * sizeof(*gpu_allnodeAoS))));

  nodeSoA gpu_allnodeSoA = {nullptr, nullptr, nullptr, nullptr, nullptr};
  GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeSoA.a, (N * sizeof(*gpu_allnodeSoA.a))));
  GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeSoA.b, (N * sizeof(*gpu_allnodeSoA.b))));
  GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeSoA.c, (N * sizeof(*gpu_allnodeSoA.c))));
  GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeSoA.d, (N * sizeof(*gpu_allnodeSoA.d))));
  GPU_alloc_check(cudaMalloc((void **)&gpu_allnodeSoA.e, (N * sizeof(*gpu_allnodeSoA.e))));

  // Note on error checking in the timed sections: the status of the
  // synchronisation is stored and examined only after the end timestamp has
  // been taken, so the timed region contains exactly the launches and the
  // synchronisation.

  // gpu warm-up
  gpu_AoS<<< nblocks, block >>>(gpu_allnodeAoS, N);
  check(cudaGetLastError(), "gpu_AoS launch");
  check(cudaDeviceSynchronize(), "gpu_AoS execution");

  ////////////////////////// GPU AoS /////////////////////////////////////////////
  start_time = wall_time();
  for (int loop=0 ; loop<LOOP ; loop++)
    gpu_AoS<<< nblocks, block >>>(gpu_allnodeAoS, N);

  const cudaError_t sync_AoS = cudaDeviceSynchronize();
  end_time = wall_time();
  check(cudaGetLastError(), "gpu_AoS launch");
  check(sync_AoS, "gpu_AoS execution");
  cout << "\t GPU AoS     time: " << ((end_time - start_time) / static_cast<double>(LOOP)) << " [s]" << endl;
  ////////////////////////////////////////////////////////////////////////////////

  // gpu warm-up
  gpu_nodeAoS<<< nblocks, block >>>(gpu_allnodeAoS, N);
  check(cudaGetLastError(), "gpu_nodeAoS launch");
  check(cudaDeviceSynchronize(), "gpu_nodeAoS execution");

  ////////////////////////// GPU nodeAoS /////////////////////////////////////////
  start_time = wall_time();
  for (int loop=0 ; loop<LOOP ; loop++)
    gpu_nodeAoS<<< nblocks, block >>>(gpu_allnodeAoS, N);

  const cudaError_t sync_nodeAoS = cudaDeviceSynchronize();
  end_time = wall_time();
  check(cudaGetLastError(), "gpu_nodeAoS launch");
  check(sync_nodeAoS, "gpu_nodeAoS execution");
  cout << "\t GPU nodeAoS time: " << ((end_time - start_time) / static_cast<double>(LOOP)) << " [s]" << endl;
  ////////////////////////////////////////////////////////////////////////////////

  // gpu warm-up
  gpu_SoA<<< nblocks, block >>>(gpu_allnodeSoA.a,
                                gpu_allnodeSoA.b,
                                gpu_allnodeSoA.c,
                                gpu_allnodeSoA.d,
                                gpu_allnodeSoA.e,
                                N);
  check(cudaGetLastError(), "gpu_SoA launch");
  check(cudaDeviceSynchronize(), "gpu_SoA execution");

  ////////////////////////// GPU SoA /////////////////////////////////////////////
  start_time = wall_time();
  for (int loop=0 ; loop<LOOP ; loop++)
    gpu_SoA<<< nblocks, block >>>(gpu_allnodeSoA.a,
                                  gpu_allnodeSoA.b,
                                  gpu_allnodeSoA.c,
                                  gpu_allnodeSoA.d,
                                  gpu_allnodeSoA.e,
                                  N);

  const cudaError_t sync_SoA = cudaDeviceSynchronize();
  end_time = wall_time();
  check(cudaGetLastError(), "gpu_SoA launch");
  check(sync_SoA, "gpu_SoA execution");
  cout << "\t GPU SoA     time: " << ((end_time - start_time) / static_cast<double>(LOOP)) << " [s]\n" << endl;
  ////////////////////////////////////////////////////////////////////////////////

  // free GPU memory
  check(cudaFree(gpu_allnodeAoS), "cudaFree");
  check(cudaFree(gpu_allnodeSoA.a), "cudaFree");
  check(cudaFree(gpu_allnodeSoA.b), "cudaFree");
  check(cudaFree(gpu_allnodeSoA.c), "cudaFree");
  check(cudaFree(gpu_allnodeSoA.d), "cudaFree");
  check(cudaFree(gpu_allnodeSoA.e), "cudaFree");

  return EXIT_SUCCESS;
}
