
#ifndef _SCAN_BEST_KERNEL_CU_
#define _SCAN_BEST_KERNEL_CU_

#include "my_cutil.h"


////////////////////////////////////////////////////////////////////////////////
//   Notes for running in PUG:
//   Bitvector size: 14 bits
//   Need to turn on automatic loop refinement
////////////////////////////////////////////////////////////////////////////////


// Define this to more rigorously avoid bank conflicts, 
// even at the lower (root) levels of the tree
// Note that due to the higher addressing overhead, performance 
// is lower with ZERO_BANK_CONFLICTS enabled.  It is provided
// as an example.
//#define ZERO_BANK_CONFLICTS 

// 16 banks on G80; LOG_NUM_BANKS is log2(NUM_BANKS) and must be kept in sync.
#define NUM_BANKS 16
#define LOG_NUM_BANKS 4

// CONFLICT_FREE_OFFSET(index): extra offset derived from `index`.
//   default:              index / NUM_BANKS
//   ZERO_BANK_CONFLICTS:  index / NUM_BANKS + index / NUM_BANKS^2
// NOTE(review): no use of this macro is visible in this file; judging by its
// name it is meant to be added to a shared-memory index as padding -- confirm
// against the code that uses it.
#ifdef ZERO_BANK_CONFLICTS
// Each shift is parenthesized on purpose: binary '+' binds tighter than '>>',
// so writing `a >> b + a >> c` would be parsed as `(a >> (b + a)) >> c`.
#define CONFLICT_FREE_OFFSET(index) \
    (((index) >> LOG_NUM_BANKS) + ((index) >> (2 * LOG_NUM_BANKS)))
#else
#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS)
#endif

// Adds one per-block value taken from `uniforms` to the elements of `g_data`
// handled by that block.
//
// Layout: 1D grid, 1D blocks.  Block b, thread t updates two elements,
//     g_data[baseIndex + b * 2 * blockDim.x + t]               (always)
//     g_data[baseIndex + b * 2 * blockDim.x + t + blockDim.x]  (only if
//                                                   t + blockDim.x < n)
//
// Parameters:
//   g_data      - array updated in place.
//   uniforms    - array of per-block addends; block b reads
//                 uniforms[b + blockOffset].
//   n           - bound compared against (threadIdx.x + blockDim.x), i.e. a
//                 block-relative limit for the second element.  The
//                 comparison is done in unsigned arithmetic, as before.
//   blockOffset - offset added to blockIdx.x when indexing `uniforms`.
//   baseIndex   - offset added to every g_data index.
//
// Shared memory: one static float, used to broadcast the addend to the block.
//
// NOTE(review): the first element is updated without any bounds check;
// presumably callers guarantee it is always in range -- confirm.
__global__ void uniformAdd_kernel(float *g_data,
                           float *uniforms,
                           int n,
                           int blockOffset,
                           int baseIndex)
{
    // Thread 0 fetches the addend once; the barrier below publishes it.
    __shared__ float uni;
    if (threadIdx.x == 0)
        uni = uniforms[blockIdx.x + blockOffset];

    // Index of this thread's first element (each block spans 2 * blockDim.x).
    unsigned int address = blockIdx.x * (blockDim.x << 1) + baseIndex + threadIdx.x;

    // Reached by every thread of the block; makes `uni` visible to all.
    __syncthreads();

    // note two adds per thread
#ifdef MUTATION
    // Deliberately altered variant selected at compile time: every thread
    // updates element 0.  Presumably a seeded defect for the verification
    // runs mentioned in the file header -- kept unchanged.
    g_data[0]                    += uni;
#else
    g_data[address]              += uni;
#endif

    // Guard the second element with a real branch instead of multiplying the
    // addend by the predicate: the masked form still loaded and stored the
    // out-of-range location, and turned it into NaN when `uni` is Inf/NaN.
    if (threadIdx.x + blockDim.x < n)
        g_data[address + blockDim.x] += uni;
}

#endif // #ifndef _SCAN_BEST_KERNEL_CU_

