#define __1D_GRID
#define __1D_THREAD_BLOCK
#include <cuda.h>

#define BIN_COUNT 64

////////////////////////////////////////////////////////////////////////////////
// GPU-specific definitions
////////////////////////////////////////////////////////////////////////////////
// Integer multiply helper. Originally labelled "fast mul on G8x / G9x / G100";
// in this file it expands to a plain multiplication.
// Both arguments and the whole expansion are parenthesized so that expression
// arguments (e.g. IMUL(a + b, c)) and surrounding operators (e.g. x / IMUL(a, b))
// are evaluated with the intended precedence.
#define IMUL(a, b) ((a) * (b))

////////////////////////////////////////////////////////////////////////////////
// Merge blockN partial histograms into one histogram: thread block blockIdx.x
// sums bin blockIdx.x across all partial histograms
// blockDim.x == BIN_COUNT
// gridDim.x  == BLOCK_N2
////////////////////////////////////////////////////////////////////////////////
#define MERGE_THREADS 64

__axiom(blockDim.x == BIN_COUNT);

// Sums one histogram bin across blockN partial histograms.
//
// Each thread block handles the bin at index blockIdx.x: its threads
// accumulate that bin's counters read from d_PartialHistograms, combine the
// per-thread sums with a tree reduction in the shared array `data`, and
// thread 0 stores the block total to d_Histogram[blockIdx.x].
//
// Parameters:
//   d_Histogram         - output; element blockIdx.x receives this block's total
//   d_PartialHistograms - input; the counter taken from partial histogram i is
//                         read at index blockIdx.x + i * BIN_COUNT
//   blockN              - number of partial histograms to sum
//
// Launch requirement: the reduction reads data[0 .. MERGE_THREADS - 1], and
// each slot is written only by the thread with the matching threadIdx.x, so
// the block must have MERGE_THREADS threads. The file-level __axiom states
// blockDim.x == BIN_COUNT, and BIN_COUNT and MERGE_THREADS are both 64.
__global__ void mergeHistogram64Kernel(
    unsigned int *d_Histogram,
    unsigned int *d_PartialHistograms,
    unsigned int blockN
){
    // One slot per thread, holding that thread's partial sum.
    __shared__ unsigned int data[MERGE_THREADS];

    // Thread t accumulates partial histograms t, t + MERGE_THREADS, ...
    // (all i < blockN); threads with threadIdx.x >= blockN contribute 0.
    unsigned int sum = 0;
    for(unsigned int i = threadIdx.x; i < blockN; i += MERGE_THREADS) {
        sum += d_PartialHistograms[blockIdx.x + i * BIN_COUNT];
    }
    data[threadIdx.x] = sum;

    // Tree reduction: stride halves from MERGE_THREADS / 2 down to 1; at each
    // step the first `stride` threads fold in the value `stride` slots above.
    for(unsigned int stride = MERGE_THREADS / 2;
#ifndef INFERENCE
      // Loop invariants, compiled out when INFERENCE is defined. These are not
      // standard CUDA; presumably they are consumed by a verification tool.
      // They state that: stride is one of 32, 16, 8, 4, 2, 1, 0;
      // any write to `data` is at the byte offset of the thread's own slot;
      // d_Histogram has been neither read nor written so far;
      // d_PartialHistograms has not been written so far.
      __invariant(stride == 32 | stride == 16 | stride == 8 | stride == 4 | stride == 2 | stride == 1 | stride == 0),
      __invariant(
        __write_implies(data,
          __write_offset(data) == sizeof(int)*threadIdx.x)),
      __invariant(__no_read(d_Histogram)),
      __invariant(__no_write(d_Histogram)),
      __invariant(__no_write(d_PartialHistograms)),
#endif
        stride > 0; stride >>= 1){
        // Block-wide barrier, reached by every thread: the stores to `data`
        // from the initial fill or the previous step must complete before
        // other threads read them below.
        __syncthreads();
         /* BUGINJECT: ADD_BARRIER, DOWN */
        if(threadIdx.x < stride) {
#ifdef MUTATION
            // Extra barrier inside the threadIdx.x < stride branch, which only
            // some threads of the block enter.
            // NOTE(review): this looks like the deliberately injected defect
            // named by the BUGINJECT marker above - confirm before changing.
            __syncthreads();
#endif
            data[threadIdx.x] += data[threadIdx.x + stride];
        }
    }

    // After the stride == 1 step, data[0] holds the block-wide total. Thread 0
    // performed that last write itself, so no further barrier precedes this read.
    if(threadIdx.x == 0)
        d_Histogram[blockIdx.x] = data[0];
}
