#!/bin/bash
#SBATCH --nodes 1
#SBATCH --ntasks-per-node 1
#SBATCH --cpus-per-task 32
#SBATCH --gres=gpu:1
#SBATCH --partition=boost_usr_prod
# Shell options: -x traces each command, -e aborts on the first failing
# command, -v echoes script lines as they are read.
set -x -e -v

# Exported so that child processes (srun / python) inherit them.
export DEBUG=1 NV_ACC_CUDA_STACKSIZE=15000

# Benchmark parameter grid.
declare -a PPL=(1 8 16 32)          # particles per leaf
declare -a THREADS=(1 2 4 8 16 32)  # OpenMP thread counts
declare -a MODE=(gpu cpu)           # execution modes
declare -a N=(1e7)                  # numbers of particles
OMP_GPU=100                         # number of particles per GPU thread

#
# test tree build on GPU and CPU
#
# Both invocations run the same wrap command; only the compiler (CC) and its
# flags (EC_FLAGS) differ, passed as per-command environment.
wrap_cmd=(srun -n 1 python -m hotwheels_core.wrap src/test_target.cpp)
# Host variant: gcc with OpenMP enabled.
OMP_NUM_THREADS=4 CC=gcc EC_FLAGS='-fopenmp -O3' "${wrap_cmd[@]}"
# GPU variant: nvc++ with OpenMP GPU offload flags.
OMP_NUM_THREADS=4 CC=nvc++ \
    EC_FLAGS='-mp=gpu -gpu=rdc,managed -gpu=cc80  -Minfo=mp,accel -Minline=10000 -O3' \
    "${wrap_cmd[@]}"

# Benchmark sweep: for every (particles per leaf, number of particles, mode,
# threads) combination, run src/testgpu.py once and append one row
# "ppl n mode threads time" to report.csv.
for ppl in "${PPL[@]}"; do # loop over particles per leaf
    for n in "${N[@]}"; do # loop over number of particles
        for mode in "${MODE[@]}"; do # switch between gpu and cpu mode
            for threads in "${THREADS[@]}"; do # loop over number of OpenMP threads
                if [[ "$mode" == "cpu" ]]; then
                    export CC=gcc
                    export EC_FLAGS='-O3 -fopenmp'
                    export OMP_NUM_THREADS=$threads
                    run_threads=$threads
                else
                    # GPU mode does not sweep the thread list: every entry
                    # except 8 is skipped, so it runs once per (ppl, n) pair.
                    if [[ $threads != 8 ]]; then
                        continue
                    fi
                    export CC=nvc++
                    # NOTE(review): the earlier wrap step uses -Minline=10000;
                    # confirm which value is intended here.
                    export EC_FLAGS='-mp=gpu -gpu=rdc,managed -gpu=cc80  -Minfo=mp,accel -Minline=1000 -O3'
                    # GPU runs always use 32 CPU cores; this does not actually
                    # change much.
                    export OMP_NUM_THREADS=32
                    # Thread count passed to the GPU run: one thread per OMP_GPU
                    # particles, computed from the current particle count $n
                    # (Python is used because n is written like 1e7).
                    run_threads=$(python -c "print(int($n // ${OMP_GPU:-100}))")
                fi
                # run the tree build
                srun -n 1 python src/testgpu.py -ppl "$ppl" -n "$n" -mode "$mode" -threads "$run_threads" -defrag 1 1>run.out 2>run.err
                # grep the timer and store
                t=$(grep 'time:' run.out | sed 's/time://')
                # $t is left unquoted on purpose so surrounding whitespace is
                # collapsed and the row stays single-space separated.
                # shellcheck disable=SC2086
                echo "$ppl" "$n" "$mode" "$run_threads" $t >> report.csv
            done
        done
    done
done