// NOTE(review): presumably configuration macros read by the verification
// tool's headers to declare a 1-D grid and a 1-D thread block -- confirm
// against the tool documentation.
#define __1D_GRID
#define __1D_THREAD_BLOCK
#include <cuda.h>

// Number of ints in the kernel's shared-memory scratch array (sdata[N]).
#define N 1024
// Presumably a verification-time assumption (not a runtime check): every
// block has N/2 = 512 threads in x.
__axiom(blockDim.x == N/2);

/*
    Parallel reduction kernels
*/

// EMUSYNC expands to a block-wide barrier when __DEVICE_EMULATION__ is
// defined and to nothing otherwise. In this file it appears only inside the
// disabled (#if 0) warp-unrolled code path of reduce4kernel.
#ifdef __DEVICE_EMULATION__
#define EMUSYNC __syncthreads()
#else
#define EMUSYNC
#endif

/*
    Parallel sum reduction using shared memory
    - takes log(n) steps for n input elements
    - uses n/2 threads: each thread adds up to two input elements while
      loading them into shared memory
    - only works for power-of-2 arrays
*/

//REVISIT: cannot refer to this array in invariants
//extern __shared__ int sdata[];

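/*
    This version unrolls the last warp to avoid synchronization where it
    isn't needed. In this file the barrier-free warp code is disabled
    (#if 0); the unrolled steps that are actually compiled each keep a
    __syncthreads(), except that one barrier is dropped when MUTATION is
    defined.
*/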

/*
    reduce4kernel: each thread block sums part of g_idata in shared memory
    and thread 0 writes the block's partial sum to g_odata[blockIdx.x].

    g_idata   - input values; each thread reads g_idata[i] and
                g_idata[i + blockSize] when those indices are below n
    g_odata   - output; one int is written per block
    n         - element count used to bounds-check the two loads
    blockSize - offset of the second load, and selector for which of the
                unrolled tail steps execute

    NOTE(review): the per-block stride is computed from blockDim.x*2 while
    the second load offset and the tail steps use blockSize. This presumably
    assumes blockSize == blockDim.x -- confirm against the launch code.
*/
__global__ void reduce4kernel(int *g_idata, int *g_odata, unsigned int n, unsigned int blockSize)
{
    // Verifier precondition: the block's x dimension is a power of two.
    __requires(__is_pow2(blockDim.x));

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    __shared__ int sdata[N];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;

    // First addend; contributes 0 when i is at or past n.
    sdata[tid] = (i < n) ? g_idata[i] : 0;
    // Second addend, blockSize elements further on, skipped when out of range.
    if (i + blockSize < n) 
        sdata[tid] += g_idata[i+blockSize];  

    // All first-level sums must be in shared memory before other threads
    // read them.
    __syncthreads();

    // do reduction in shared mem
    // The stride s starts at blockDim.x/2 and is halved each pass while
    // s > 32; strides of 32 and below are handled by the unrolled steps
    // after the loop. The __invariant(...) clauses are verifier annotations
    // that are compiled only when INFERENCE is not defined.
    for(unsigned int s=blockDim.x/2;
#ifndef INFERENCE
        __invariant(s >= 0),
        __invariant(s ==    8 | s == 16  | s ==  32 | s ==   64 |
                    s ==  128 | s == 256 | s == 512),
        __invariant(__no_read(g_odata)),
        __invariant(__no_write(g_odata)),
        __invariant(__no_read(sdata)),
        __invariant(__no_write(sdata)),
#endif
         s>32; s>>=1) 
    {
        // Only the lower s threads accumulate; each adds the element s
        // slots above its own.
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        // Barrier is outside the tid test so every thread reaches it.
        __syncthreads();
    }

    // GPUVerify cannot verify the following because it relies on warp specific reasoning
#if 0
#ifndef __DEVICE_EMULATION__
    if (tid < 32)
#endif
    {
       if (blockSize >=  64) { sdata[tid] += sdata[tid + 32]; EMUSYNC; }
       if (blockSize >=  32) { sdata[tid] += sdata[tid + 16]; EMUSYNC; }
       if (blockSize >=  16) { sdata[tid] += sdata[tid +  8]; EMUSYNC; }
       if (blockSize >=   8) { sdata[tid] += sdata[tid +  4]; EMUSYNC; }
       if (blockSize >=   4) { sdata[tid] += sdata[tid +  2]; EMUSYNC; }
       if (blockSize >=   2) { sdata[tid] += sdata[tid +  1]; EMUSYNC; }
    }
#else
       // Compiled replacement for the disabled branch: the same strides
       // (32 down to 1), but each step restricts the active threads with an
       // explicit tid test and is followed by a block-wide barrier.
       // The barriers sit inside the blockSize tests; blockSize is a kernel
       // argument, so all threads of a block take the same branch.
       if (blockSize >=  64) { if (tid < 32) { sdata[tid] += sdata[tid + 32]; } __syncthreads(); }
       if (blockSize >=  32) { if (tid < 16) { sdata[tid] += sdata[tid + 16]; } __syncthreads(); }
       if (blockSize >=  16) { if (tid <  8) { sdata[tid] += sdata[tid +  8]; } __syncthreads(); }
       if (blockSize >=   8) { if (tid <  4) { sdata[tid] += sdata[tid +  4]; } __syncthreads(); }
        /* BUGINJECT: REMOVE_BARRIER, DOWN */
       // NOTE(review): defining MUTATION removes the barrier of this step,
       // presumably to inject a deliberate data race for testing -- confirm
       // with the test harness. Without it, the stride-1 step below may read
       // sdata[1] before thread 1 has finished updating it.
       if (blockSize >=   4) {
         if (tid <  2) { sdata[tid] += sdata[tid +  2]; }
#ifndef MUTATION
         __syncthreads(); 
#endif
       }
       if (blockSize >=   2) { if (tid <  1) { sdata[tid] += sdata[tid +  1]; } __syncthreads(); }
#endif

    // write result for this block to global mem 
    // Only thread 0 writes; sdata[0] holds the block's accumulated sum.
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
