diff --git a/Makefile b/Makefile
index 61afb16b9a7132c84da5e7c269204bd52c4904b0..5384ac45203b7884b42f0e7be71d23f886dfca99 100644
--- a/Makefile
+++ b/Makefile
@@ -37,7 +37,7 @@ FFTWLIBS =
 OPT += -DUSE_FFTW
 
 # use omp-ized version of fftw routines
-OPT += -DHYBRID_FFTW
+#OPT += -DHYBRID_FFTW
 
 # write the full 3D cube of gridded visibilities and its FFT transform
 #OPT += -DWRITE_DATA
@@ -73,10 +73,10 @@ OPT += -DPHASE_ON
 #OPT += -DNVIDIA
 
 #use cuda for GPUs
-#OPT += -DCUDACC
+OPT += -DCUDACC
 
 # use GPU acceleration via OMP 
-OPT += -DACCOMP
+#OPT += -DACCOMP
 
 # use NVIDIA GPU to perform the reduce
 #OPT += -DNCCL_REDUCE
@@ -85,7 +85,7 @@ OPT += -DACCOMP
 #OPT += -DRCCL_REDUCE
 
 # use GPU to perform FFT
-#OPT += -DCUFFTMP
+OPT += -DCUFFTMP
 
 #support for AMD GPUs
 #OPT += __HIP_PLATFORM_AMD__
@@ -141,8 +141,13 @@ OBJ_RCCL_REDUCE  = gridding_rccl.o
 
 # ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT
 #
-DEPS_ACC_CUFFTMP = w-stacking_omp.h cuda_fft.cpp
+ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
+DEPS_ACC_CUFFTMP = cuda_fft.cu 
+OBJ_ACC_CUFFTMP  = cuda_fft.o
+else
+DEPS_ACC_CUFFTMP = cuda_fft.cpp 
 OBJ_ACC_CUFFTMP  = cuda_fft.o
+endif
 
 
 # -----------------------------------------------------
@@ -173,13 +178,19 @@ w-stacking.c: w-stacking.cu
 
 phase_correction.c: phase_correction.cu
 	cp phase_correction.cu phase_correction.c
+
+cuda_fft.cpp: cuda_fft.cu
+	cp cuda_fft.cu cuda_fft.cpp
 else
 w-stacking.c: w-stacking.cu
-	rm -f w-stacking.cun
+	rm -f w-stacking.c
 	touch w-stacking.c
 phase_correction.c: phase_correction.cu
 	rm -f phase_correction.c
 	touch phase_correction.c
+cuda_fft.cpp: cuda_fft.cu
+	rm -f cuda_fft.cpp
+	touch cuda_fft.cpp
 endif
 
 
@@ -192,7 +203,7 @@ endif
 ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
 EXEC_EXT := $(EXEC_EXT)_acc-cuda
 LINKER=$(MPIC++)
-FLAGS=$(OPTIMIZE)
+FLAGS=$(OPTIMIZE) 
 LIBS=$(NVLIB)
 $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA)
 	$(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c *.cu $(LIBS)
@@ -250,15 +261,28 @@ OBJ += $(OBJ_RCCL_REDUCE)
 endif
 
 ifeq (CUFFTMP,$(findstring CUFFTMP,$(OPT)))
+
+ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
+EXEC_EXT := $(EXEC_EXT)_acc-fft
+LINKER=$(MPIC++)
+FLAGS=$(OPTIMIZE)
+LIBS=$(NVLIB_2)
+$(OBJ_ACC_CUFFTMP): $(DEPS_ACC_CUFFTMP)
+	$(NVCC) $(OPT_NVCC) $(OPT) -c $^ $(LIBS)
+OBJ += $(OBJ_ACC_CUFFTMP)
+
+else
+
 EXEC_EXT := $(EXEC_EXT)_acc-fft
 LINKER=$(NVC++)
 FLAGS=$(NVFLAGS) $(CFLAGS)
-LIBS=$(NVLIB) $(NVLIB_2)
+LIBS=$(NVLIB_2)
 $(OBJ_ACC_CUFFTMP): $(DEPS_ACC_CUFFTMP)
 	$(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS)
 OBJ += $(OBJ_ACC_CUFFTMP)
 endif
 
+endif
 
 ###################################################################################
 
@@ -270,11 +294,11 @@ w-stacking: $(OBJ) $(DEPS) Makefile
 %.o: %.c $(DEPS)
 	$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
 
-
 clean:
 	rm -f *.o
 	rm -f w-stacking.c
 	rm -f phase_correction.c
+	rm -f cuda_fft.cpp
 
 cleanall:
 	rm -f $(EXEC)$(EXT)
diff --git a/allvars.h b/allvars.h
index 8c8372257bcf1b552347914cd3371c1c93cbca26..9ebf91ad6d69f5d1a02c823485b7aecfa3d31b4e 100644
--- a/allvars.h
+++ b/allvars.h
@@ -34,9 +34,6 @@
 #include "w-stacking.h"
 #endif 
 
-#if defined(CUDACC)
-#include <cuda.h>
-#endif
 
 #if defined(NVIDIA)
 #include <cuda_runtime.h>
diff --git a/allvars_nccl.h b/allvars_nccl.h
index b48763ff98c19dd49faf674c481d0720a2eb0b60..032a26ef4f6b0f57d058827b9ee72310dfc6c999 100644
--- a/allvars_nccl.h
+++ b/allvars_nccl.h
@@ -13,7 +13,7 @@
 #include <unistd.h>
 
 
-#if !defined( NCCL_REDUCE )
+#if !defined( NCCL_REDUCE ) && !defined(__CUDACC__)
 #include <stdatomic.h>
 #endif
 
diff --git a/main.c b/main.c
index 4643c6345d954201ca25aafe05956766b0d213f4..d94e0b9c1b4f3b217e16149b6be51da8eec46842 100644
--- a/main.c
+++ b/main.c
@@ -3,7 +3,6 @@
 #include "allvars.h"
 #include "proto.h"
 
-
 void shutdown_wstacking( int errcode, char *message, char *fname, int linenum )
 {
   if ( ( rank == 0 ) &&
@@ -77,25 +76,6 @@ int main(int argc, char * argv[])
 
   FFT_INIT;    
 
- #if defined(CUDACC) 
-  int ndevices;
-  cudaGetDeviceCount(&ndevices);
-  cudaSetDevice(rank % ndevices);
-
-  if ( rank == 0 ) {
-    if (0 == ndevices) {
-      
-      shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ );
-    }
-  
-
-  printf("Running rank %d/%d using GPU %d\n", rank, size, rank % ndevices);
- #ifdef NVIDIA
-  prtAccelInfo();
- #endif
-  }
- #endif
-  
  #ifdef ACCOMP
   if ( rank == 0 ) {
     if (0 == omp_get_num_devices()) {
diff --git a/phase_correction.cu b/phase_correction.cu
index 37974d4f0027296c83578c0d22af674118928239..644f59d67ef8bb60b4dcbf974ba3be1f0905b149 100644
--- a/phase_correction.cu
+++ b/phase_correction.cu
@@ -11,6 +11,8 @@
 #include <math.h>
 #include <stdlib.h>
 #include <stdio.h>
+#include "errcodes.h"
+#include "proto.h"
 
 #ifdef __CUDACC__
 
@@ -106,7 +108,22 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 	int Nth = NTHREADS;
         long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth/nbucket) + 1;
         if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
-        	
+
+	int ndevices;
+	cudaGetDeviceCount(&ndevices);
+	cudaSetDevice(rank % ndevices);
+
+	if ( rank == 0 ) {
+	  if (0 == ndevices) {
+
+	    shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ );
+	  }
+
+	}
+	  printf("Running rank %d using GPU %d\n", rank, rank % ndevices);
+	 #ifdef NVIDIA
+	  prtAccelInfo();
+	 #endif
 
 	cudaError_t mmm;
 	double * image_real_g;
diff --git a/w-stacking.cu b/w-stacking.cu
index d5aae633623201675006e897b5bbc612998f2957..c11f622614bbc1d49b37ef63f741a21584757fff 100644
--- a/w-stacking.cu
+++ b/w-stacking.cu
@@ -5,7 +5,15 @@
 #include <math.h>
 #include <stdlib.h>
 #include <stdio.h>
+#include "errcodes.h"
+
+#ifdef __CUDACC__
+#include "allvars_nccl.h"
+#else
 #include "allvars.h"
+#endif
+
+#include "proto.h"
 
 #ifdef ACCOMP
 #pragma omp  declare target
@@ -109,8 +117,7 @@ __global__ void convolve_g(
      int grid_size_x,
      int grid_size_y,
      double* grid,
-     double std22,
-     int rank)
+     double std22)
 
 {
 	//printf("DENTRO AL KERNEL\n");
@@ -226,12 +233,29 @@ void wstack(
 
     // Loop over visibilities.
 // Switch between CUDA and GPU versions
-#ifdef __CUDACC__
+   #ifdef __CUDACC__
     // Define the CUDA set up
     int Nth = NTHREADS;
     uint Nbl = (uint)(num_points/Nth) + 1;
     if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
     uint Nvis = num_points*freq_per_chan*polarizations;
+
+    int ndevices;
+    cudaGetDeviceCount(&ndevices);
+    cudaSetDevice(rank % ndevices);
+
+    if ( rank == 0 ) {
+      if (0 == ndevices) {
+
+	shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ );
+      }
+    }
+
+      printf("Running rank %d/%d using GPU %d\n", rank, size, rank % ndevices);
+     #ifdef NVIDIA
+      prtAccelInfo();
+     #endif
+
     printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);
 
     // Create GPU arrays and offload them
@@ -264,7 +288,7 @@ void wstack(
     mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice);
     mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice);
     if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpy ERROR %d !!!\n", mmm);}
-
+    
     // Call main GPU Kernel
     convolve_g <<<Nbl,Nth>>> (
 	       num_w_planes,
@@ -396,12 +420,13 @@ void wstack(
 		grid[iKer+1] += add_term_img;
             }
         }
-
+	
     }
-// End switch between CUDA and CPU versions
+    // End switch between CUDA and CPU versions
 #endif
     //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
 }
+
 #ifdef ACCOMP
 #pragma  omp end declare target
 #endif
diff --git a/w-stacking.h b/w-stacking.h
index c74fb06f770359e89c04346459c87329096ac2a9..f06bbbc0ae51dbea747afdb4d805d0c1cb6b8ece 100644
--- a/w-stacking.h
+++ b/w-stacking.h
@@ -8,8 +8,6 @@
 
 #include <mpi.h>
 
-
-
 #ifdef __CUDACC__
 extern "C"
 #endif