diff --git a/phase_correction.cu b/phase_correction.cu
index 95b2933df54ff129a26549b2928ed84715a02c05..4c26890ccb8f1e1205e0a5eb97f7dba8eda06463 100644
--- a/phase_correction.cu
+++ b/phase_correction.cu
@@ -108,12 +108,18 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 	double * gridss_g;
 
         mmm=cudaMalloc(&gridss_g, 2*num_w_planes*xaxis*yaxis*sizeof(double));
+	if (mmm != cudaSuccess) printf("CUDA ERROR 1 %s\n",cudaGetErrorString(mmm));	// report only real failures, not "no error"
 	mmm=cudaMalloc(&image_real_g, xaxis*yaxis*sizeof(double));
+	if (mmm != cudaSuccess) printf("CUDA ERROR 2 %s\n",cudaGetErrorString(mmm));	// allocation of the real-part image buffer failed
 	mmm=cudaMalloc(&image_imag_g, xaxis*yaxis*sizeof(double));
+	if (mmm != cudaSuccess) printf("CUDA ERROR 3 %s\n",cudaGetErrorString(mmm));	// allocation of the imaginary-part image buffer failed
 
 	mmm=cudaMemcpy(gridss_g, gridss, 2*num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice);
+	if (mmm != cudaSuccess) printf("CUDA ERROR 4 %s\n",cudaGetErrorString(mmm));	// host-to-device upload of gridss failed
 	mmm=cudaMemset(image_real_g, 0.0, xaxis*yaxis*sizeof(double));
+	if (mmm != cudaSuccess) printf("CUDA ERROR 5 %s\n",cudaGetErrorString(mmm));	// zeroing image_real_g failed
 	mmm=cudaMemset(image_imag_g, 0.0, xaxis*yaxis*sizeof(double));
+	if (mmm != cudaSuccess) printf("CUDA ERROR 6 %s\n",cudaGetErrorString(mmm));	// zeroing image_imag_g failed
 
 	// call the phase correction kernel
 	phase_g <<<Nbl,Nth>>> (xaxis,
@@ -131,7 +137,9 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 			       nbucket);
 
 	mmm = cudaMemcpy(image_real, image_real_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost);
+	if (mmm != cudaSuccess) printf("CUDA ERROR 7 %s\n",cudaGetErrorString(mmm));	// first blocking call after the phase_g launch: may also report kernel faults
 	mmm = cudaMemcpy(image_imag, image_imag_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost);
+	if (mmm != cudaSuccess) printf("CUDA ERROR 8 %s\n",cudaGetErrorString(mmm));	// device-to-host download of the imaginary part failed
 
 #else
 
diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c
index 5c6833d42208b37a512942e8b080f574370b86e5..0eda87e9b2dcef70714081129aff0fbeb468fae0 100644
--- a/w-stacking-fftw.c
+++ b/w-stacking-fftw.c
@@ -70,6 +70,7 @@ int main(int argc, char * argv[])
 	char logfile[30] = "run.log";
 	char extension[30] = ".txt";
 	char srank[4];
+	char timingfile[30] = "timings.dat";
 
 	double * uu;
 	double * vv;
@@ -940,6 +941,17 @@ int main(int argc, char * argv[])
 	  }
         }
 
+	if (rank == 0)	// only rank 0 writes the timing summary
+	{
+	 pFile = fopen (timingfile,"w");
+	 if (pFile == NULL)	// fopen can fail; never hand a NULL stream to fprintf/fclose
+	   fprintf(stderr, "Cannot open %s for writing\n", timingfile);
+	 else if (num_threads == 1)
+	   fprintf(pFile, "%f %f %f %f %f %f %f\n",setup_time,kernel_time,compose_time,reduce_time,fftw_time,phase_time,tot_time);
+	 else
+	   fprintf(pFile, "%f %f %f %f %f %f %f\n",setup_time1,kernel_time1,compose_time1,reduce_time1,fftw_time1,phase_time1,tot_time1);
+	 if (pFile != NULL) fclose(pFile);	// close only a stream that was actually opened
+	}
 
 	// Close MPI environment
 	#ifdef USE_MPI