
/*
    Parallel reduction kernels
*/

#ifndef _REDUCE_KERNEL_H_
#define _REDUCE_KERNEL_H_

#include "my_cutil.h"

// EMUSYNC expands to a block-wide barrier (__syncthreads()) when the file is
// compiled with __DEVICE_EMULATION__ defined, and to nothing otherwise.
#ifdef __DEVICE_EMULATION__
#define EMUSYNC __syncthreads()
#else
#define EMUSYNC
#endif

////////////////////////////////////////////////////////////////////////////////
//   Notes for running in PUG:
//   Bitvector size: 24 bits
////////////////////////////////////////////////////////////////////////////////

// Macros to append an SM version identifier to a function name.
// This allows us to compile a file multiple times for different architecture
// versions.
//   FUNCVERSION(x, y)  - pastes x and y together with an underscore: x_y
//   XFUNCVERSION(x, y) - extra level of indirection so that x and y are
//                        macro-expanded before being pasted. It is necessary
//                        to evaluate the value of the SMVERSION macro rather
//                        than appending "SMVERSION" itself.
//   FUNC(NAME)         - yields NAME_<value of SMVERSION>. SMVERSION is not
//                        defined in this file; presumably the build or the
//                        including file supplies it -- confirm.
#define FUNCVERSION(x, y) x ## _ ## y
#define XFUNCVERSION(x, y) FUNCVERSION(x, y)
#define FUNC(NAME) XFUNCVERSION(NAME, SMVERSION) 

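/*
    Parallel sum reduction using shared memory
    - takes log(n) steps for n input elements
    - uses n threads in its basic form (the kernel below instead sums
      several elements per thread)
    - the in-block tree reduction only works for power-of-2 block sizes;
      the kernel below bounds-checks its loads against n, so the array
      length itself does not have to be a power of 2
*/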

// Shared-memory scratch array holding one partial sum per thread of the block.
// It is declared extern with no size, so its size is not fixed in this file
// (dynamic shared memory; presumably supplied as the shared-memory size of the
// kernel launch -- confirm at the call site).
extern __shared__ int sdata[];

/*
    This version adds multiple elements per thread sequentially.  This reduces the overall
    cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).
    (Brent's Theorem optimization)
*/

// Block-level integer sum reduction.
//
// Every block sums a strided subset of g_idata and writes a single partial sum
// to g_odata[blockIdx.x]; the partial sums still have to be combined by the
// caller (or by another launch).
//
// Parameters:
//   g_idata   - input array of n ints
//   g_odata   - output array; needs one element per block (indexed by blockIdx.x)
//   n         - number of valid elements in g_idata
//   blockSize - number of threads per block. Must be a power of two no larger
//               than 1024; other values silently drop elements in the tree
//               reduction. The kernel never reads blockDim, so the caller is
//               responsible for launching with blockDim.x == blockSize.
//   nIsPow2   - kept for interface compatibility only. The second load of each
//               iteration is now always bounds-checked: skipping the check when
//               nIsPow2 was set read past the end of g_idata whenever
//               n < 2*blockSize (e.g. n == 1). For every launch where the
//               unchecked load was in bounds, the result is unchanged.
//
// Launch requirements: sdata must provide at least one int per thread.
//
// Defining MUTATION at compile time deliberately removes one barrier (see the
// BUGINJECT marker below) so that race-detection tools have a known defect to
// find. Never define it in a build whose results matter.
__global__ void
  FUNC(reduce6kernel)(int *g_idata, int *g_odata, unsigned int n, unsigned int blockSize, bool nIsPow2)
{
    (void)nIsPow2;  // no longer consulted; see the header comment

    unsigned int tid = threadIdx.x;

    // Index and stride are 64-bit so that neither "i + blockSize" nor
    // "i += gridSize" can wrap around when n or the grid is large.
    unsigned long long i = (unsigned long long)blockIdx.x * blockSize * 2ull + tid;
    unsigned long long gridSize = (unsigned long long)blockSize * 2ull * gridDim.x;

    // First level of reduction: read from global memory and accumulate in a
    // register.  Each thread sums multiple elements; the number is determined
    // by the number of active thread blocks (via gridDim).  More blocks result
    // in a larger gridSize and therefore fewer elements per thread.
    int mySum = 0;
    while (i < n)
    {
        mySum += g_idata[i];
        // the partner element one blockSize further on may lie past the end
        if (i + blockSize < n)
            mySum += g_idata[i + blockSize];
        i += gridSize;
    }

    // publish this thread's partial sum, then wait until all of them are visible
    sdata[tid] = mySum;
    __syncthreads();

    // Tree reduction in shared memory.  blockSize is uniform across the block,
    // so every thread reaches the same set of barriers.
    if (blockSize >= 1024) { if (tid < 512) { sdata[tid] += sdata[tid + 512]; } __syncthreads(); }
    if (blockSize >=  512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >=  256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >=  128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } __syncthreads(); }

    // The last steps also use full block barriers rather than relying on
    // implicit warp-synchronous execution.
    if (blockSize >=   64) { if (tid <  32) { sdata[tid] += sdata[tid +  32]; } __syncthreads(); }
    if (blockSize >=   32) { if (tid <  16) { sdata[tid] += sdata[tid +  16]; } __syncthreads(); }
    if (blockSize >=   16) { if (tid <   8) { sdata[tid] += sdata[tid +   8]; } __syncthreads(); }
    if (blockSize >=    8) { if (tid <   4) { sdata[tid] += sdata[tid +   4]; } __syncthreads(); }
        /* BUGINJECT: REMOVE_BARRIER, DOWN */
    if (blockSize >=    4) {
        if (tid < 2) { sdata[tid] += sdata[tid + 2]; }
        // Without this barrier thread 0 may read sdata[1] in the next step
        // before thread 1 has finished updating it.
#ifndef MUTATION
        __syncthreads();
#endif
    }
    if (blockSize >=    2) { if (tid <   1) { sdata[tid] += sdata[tid +   1]; } __syncthreads(); }

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}


#endif // #ifndef _REDUCE_KERNEL_H_
