
#ifndef HISTOGRAM64_KERNEL_CUH
#define HISTOGRAM64_KERNEL_CUH

#include "my_cutil.h"


////////////////////////////////////////////////////////////////////////////////
//    Notes for running in PUG:
//   Bitvector size: must be 32 bits
//   The ASSUME_NO_OVFLO flag can be turned off to obtain substantial speedups
////////////////////////////////////////////////////////////////////////////////


//Total number of possible data values
#define      BIN_COUNT 64

////////////////////////////////////////////////////////////////////////////////
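// Merge blockN partial histograms into the final histogram:
// block b adds up counter b of every partial histogram (records are
// BIN_COUNT entries apart) and stores the total in d_Histogram[b].
// blockDim.x == MERGE_THREADS (size of the shared reduction buffer)
// gridDim.x  == number of counters to merge; the original note says
//               BLOCK_N2, but the indexing suggests BIN_COUNT -- TODO confirm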
#define MERGE_THREADS 64

// Adds up one counter across all partial histograms.
//
// The block with index blockIdx.x reads element (blockIdx.x + i * BIN_COUNT)
// of d_PartialHistograms for every i in [0, blockN), sums those values and
// stores the total in d_Histogram[blockIdx.x].
//
// Parameters:
//   d_Histogram          output array; one entry is written per block
//   d_PartialHistograms  input array holding blockN records spaced
//                        BIN_COUNT entries apart
//   blockN               number of records to accumulate
//
// Launch requirement: blockDim.x must equal MERGE_THREADS. The shared
// buffer has exactly MERGE_THREADS slots, each thread fills the slot
// matching its threadIdx.x, and the reduction consumes all of them.
//
// NOTE(review): when MUTATION is defined an extra barrier is compiled into a
// branch that only part of the block executes. This looks like deliberate
// fault injection for verification tooling -- confirm before enabling it.
__global__ void mergeHistogram64Kernel(
    unsigned int *d_Histogram,
    unsigned int *d_PartialHistograms,
    unsigned int blockN
){
    const unsigned int tid = threadIdx.x;
    const unsigned int bin = blockIdx.x;

    __shared__ unsigned int partial[MERGE_THREADS];

    // Phase 1: each thread accumulates every MERGE_THREADS-th record.
    unsigned int acc = 0;
    unsigned int record = tid;
    while(record < blockN){
        acc += d_PartialHistograms[bin + record * BIN_COUNT];
        record += MERGE_THREADS;
    }
    partial[tid] = acc;

    // Phase 2: fold the upper half of the buffer onto the lower half until
    // a single total is left in partial[0].
    unsigned int width = MERGE_THREADS >> 1;
    while(width != 0){
        // Make the writes of the previous step (or of phase 1) visible
        // to the whole block before they are read.
        __syncthreads();
        if(tid < width){
#ifdef MUTATION
            __syncthreads();
#endif
            partial[tid] = partial[tid] + partial[tid + width];
        }
        width /= 2;
    }

    // Phase 3: thread 0 wrote partial[0] in the last step, so it can publish
    // the total without another barrier; all other threads are done.
    if(tid != 0)
        return;
    d_Histogram[bin] = partial[0];
}


#endif
