////////////////////////////////////////////////////////////////////////////////////////////////
// The program shows how to port a linked list, created by the CPU, onto the GPU.
//
// The nvcc compiler might issue the warning "Stack size for entry function … cannot be statically determined".
// The reason is the recursive function printLinkedList.
////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 12.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc++ linked_list.cu -o linked_list
// - Run the code:
//   $ ./linked_list
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <iostream>
#include <unistd.h>
#include <time.h>
#include <cuda.h>
#include <new>
#include <cassert>
#include <vector>

// Number of nodes in the list built by createLinkedList.
#define N          10 // linked list size
// Threads per block used when launching GpuPrintLinkedList.
#define BLOCKSIZE  1024
// Device selectors passed to printLinkedList; they index its two-entry label table.
#define CPU        0
#define GPU        1
// Compile-time guard: reject a BLOCKSIZE above 1024.
#if (BLOCKSIZE > 1024)
#error BLOCKSIZE cannot be larger than 1024
#endif

// datatypes
// One element of a singly linked list.
// The same layout is used for nodes living in host memory and for the copies
// placed in device memory, so a node can be transferred with a plain byte copy.
struct node
{
  node *next;  // successor in the list, nullptr at the tail
  int   data;  // payload carried by the node
};

// Pointer-to-node alias used throughout the file.
// (In C++ the struct tag `node` is already a type name, so no extra typedef is needed.)
using nodeptr = node *;

// Host-side registry of the device addresses of all nodes allocated by copyNode.
// freeGpuLinkedList walks this container to release the GPU list, so the host
// never has to dereference device memory to find the nodes.
static std::vector<nodeptr> GPUptr;

// Abort the program when a CUDA runtime call reports a failure.
//
// error    : status code returned by the CUDA runtime call
// cudaFunc : name of the call; used only to label the diagnostic
//
// On cudaSuccess the function simply returns. Otherwise it writes the label
// together with the runtime's own description of the error to stderr and
// terminates the process with EXIT_FAILURE.
void GPU_check(const cudaError error,
	       const char      *cudaFunc)
{
  if (error == cudaSuccess)
    return;

  // cudaGetErrorString turns the numeric status into a readable reason,
  // which is what is needed to diagnose the failure
  std::cerr << "\t " << cudaFunc << " fails! ("
	    << cudaGetErrorString(error)
	    << ") ... aborting ..." << std::endl;

  exit(EXIT_FAILURE);
}

// Return the current wall-clock time in seconds.
//
// The value comes from CLOCK_MONOTONIC, so only the difference between two
// calls is meaningful (elapsed seconds). A CPU-time clock is deliberately not
// used: it does not advance while the process sleeps or is blocked, and would
// therefore under-report elapsed time.
double wall_time()
{
  struct timespec ts{};  // zero-initialised so a failed query yields 0.0, not garbage
  clock_gettime(CLOCK_MONOTONIC, &ts);

  return static_cast<double>(ts.tv_sec) + static_cast<double>(ts.tv_nsec) * 1.0e-9;
}

// Allocate a single list node in host memory.
//
// data : payload stored in the new node
//
// Returns a pointer to a node whose `next` is nullptr. If the allocation
// fails, a diagnostic is written to stderr and the process exits with
// EXIT_FAILURE, so the returned pointer is never null.
nodeptr createNode(const int data)
{
  // The nothrow form returns nullptr on failure (plain `new` would throw
  // std::bad_alloc instead), which makes the check below meaningful; being a
  // regular `if`, the check is also kept in builds that define NDEBUG.
  nodeptr pn = new (std::nothrow) node{nullptr, data};

  if (pn == nullptr)
    {
      std::cerr << "\t createNode fails! ... aborting ..." << std::endl;
      exit(EXIT_FAILURE);
    }

  return pn;
}

// Build the host-side list holding the values 1, 2, ..., N in that order.
//
// Nodes are created from N down to 1 and each one is pushed at the front,
// so the node created last (value 1) ends up as the head.
//
// Returns the head of the list; the caller releases it with freeCpuLinkedList.
nodeptr createLinkedList()
{
  nodeptr head = nullptr;

  int value = N;
  while (value > 0)
    {
      nodeptr fresh = createNode(value);
      fresh->next = head;  // the previous head becomes the successor
      head = fresh;
      --value;
    }

  assert(head != nullptr);

  return head;
}

// Release every node of a host-side list.
//
// cpu_node : head of the list to destroy (nullptr is accepted and is a no-op)
//
// The pointer is received by value, so the caller's own copy is left
// untouched and must not be used after this call.
// The list is walked iteratively, so the stack usage does not grow with the
// length of the list.
void freeCpuLinkedList(nodeptr cpu_node)
{
  while (cpu_node)
    {
      // read the successor before the current node is destroyed
      nodeptr successor = cpu_node->next;
      delete cpu_node;
      cpu_node = successor;
    }
}

// Release every device node recorded in GPUptr.
//
// Each cudaFree is verified with GPU_check, like the other CUDA runtime calls
// in this file, so a failure aborts with a diagnostic instead of going
// unnoticed. The registry is emptied afterwards, which leaves it ready for a
// new list and makes a second call a harmless no-op.
void freeGpuLinkedList()
{
  for (nodeptr el : GPUptr)
    GPU_check(cudaFree(el), "cudaFree");

  // drop the now-dangling device addresses
  GPUptr.clear();
}

// Copy one host node into freshly allocated device memory.
//
// pn : host node to copy (must not be nullptr)
//
// The node is transferred byte for byte, `next` included, so the caller is
// responsible for `pn->next` already holding the value the device copy should
// have. The device address is recorded in GPUptr and returned.
// Any CUDA failure aborts the program through GPU_check.
nodeptr copyNode(const nodeptr pn)
{
  assert(pn != nullptr);

  void *raw = nullptr;
  GPU_check(cudaMalloc(&raw, sizeof(node)), "cudaMalloc");

  nodeptr device_copy = static_cast<nodeptr>(raw);
  GPU_check(cudaMemcpy(device_copy, pn, sizeof(node), cudaMemcpyHostToDevice), "cudaMemcpy");

  // remember the allocation so that it can be freed later
  GPUptr.push_back(device_copy);

  return device_copy;
}

// Recursively mirror a host list in device memory.
//
// pn : head of the (remaining) host list; nullptr marks the end
//
// The tail is copied first, because a device node can only be written once
// the device address of its successor is known. Each host node is staged in a
// temporary whose `next` holds that device address, and the temporary is then
// handed to copyNode.
//
// Returns the device address of the copied head, or nullptr for an empty list.
nodeptr copyList(const nodeptr pn)
{
  if (pn == nullptr)
    return nullptr;

  nodeptr device_tail = copyList(pn->next);

  node staged;
  staged.next = device_tail;
  staged.data = pn->data;

  return copyNode(&staged);
}

// Print a linked list, one node per line; callable from both host and device code.
//
// root   : first node to print; it must be dereferenceable by the caller
//          (the function reads root->data and root->next)
// device : selects the line prefix; it indexes a two-entry table, so it must
//          be 0 (CPU label) or 1 (GPU label)
//
// The function recurses once per node and prints a final newline when it
// reaches the end of the list.
__device__ __host__ void printLinkedList(const nodeptr root,
					 const int     device)
{
  const char prefix[2][16] = {"\n\t CPU ---> ", "\n\t GPU ---> "};

  // end of the list: terminate the output and stop recursing
  if (!root)
    {
      printf("\n");
      return;
    }

  printf("%s", prefix[device]);
  printf("Node->data %d", root->data);

  printLinkedList(root->next, device);
}

// Kernel: print the list stored in device memory.
//
// root : device address of the head node
//
// The global thread index is computed from the x dimension only, so the kernel
// is meant for a 1D launch. The thread with global index 0 does the printing;
// every other thread does nothing.
__global__ void GpuPrintLinkedList(const nodeptr root)
{
  const unsigned int global_id = (blockIdx.x * blockDim.x) + threadIdx.x;

  if (global_id == 0)
    printLinkedList(root, GPU);
}

// Demo driver: build a list on the CPU, print it, mirror it on the GPU,
// print it from a kernel, then release both copies.
//
// Returns EXIT_SUCCESS; any CUDA failure terminates the program earlier
// through GPU_check.
int main()
{
  // create linked list on the CPU
  nodeptr head = createLinkedList();

  // print linked list on the CPU
  printLinkedList(head, CPU);

  // copy linked list onto the GPU
  nodeptr gpu_head = copyList(head);

  // print linked list on the GPU
  GpuPrintLinkedList<<< 1, BLOCKSIZE >>>(gpu_head);

  // a kernel launch returns no status: launch errors must be queried explicitly
  GPU_check(cudaGetLastError(), "GpuPrintLinkedList launch");

  // GPU synchronization: wait for the kernel to finish; errors raised while
  // the kernel was running are reported by this call
  GPU_check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // free CPU memory
  freeCpuLinkedList(head);

  // free GPU memory
  freeGpuLinkedList();

  return EXIT_SUCCESS;
}
